## NOTE(review): a stray "HEAD" token (leftover git merge-conflict marker)
## preceded this chunk; it would error at run time ("object 'HEAD' not found")
## and has been removed.
## import gtex medians data ----
## Download the GTEx v8 per-tissue median-TPM matrix (GCT format: two header
## lines to skip), keep the gene annotation columns separately, and convert
## the remaining tissue columns to a numeric matrix keyed by versioned
## Ensembl id.
temp <- tempfile()
download.file("https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz", temp)
gtex <- read.table(temp, skip = 2, header = TRUE, sep = "\t")
## Name = versioned Ensembl id, Description = HGNC symbol (lookup kept for later)
gtex_rowinfo <- data.frame(Name = gtex$Name, Description = gtex$Description)
rownames(gtex) <- gtex$Name
## drop the two annotation columns; what remains are tissue median TPMs
gtex <- as.matrix(gtex[, 3:ncol(gtex)])
unlink(temp); rm(temp)
## import ensembl gene data ----
## Query Ensembl biomart (GRCh37) for protein-coding genes with position,
## symbol, and stable id; keep only canonical chromosomes and genes that
## have an HGNC symbol.
ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", GRCh = 37)
genes <- getBM(attributes = c('chromosome_name', 'start_position', 'end_position',
                              'hgnc_symbol', 'ensembl_gene_id', 'gene_biotype'),
               filters = list('biotype' = 'protein_coding'),
               mart = ensembl, useCache = FALSE)  # spell out FALSE (F is reassignable)
genes <- genes[which(is.element(genes$chromosome_name, c(1:22, "X", "Y", "MT")) & genes$hgnc_symbol != ""), ]
## show mitochondrial genes drive a large part of sample similarity
## Note sum of medians in gtex not quite 1M - likely artifact of taking medians
## Bar chart: total median TPM per tissue column (expected ~1e6 if medians
## behaved like a proper TPM column).
temp <- data.frame(
  names = colnames(gtex),
  total_transcripts = colSums(gtex)
)
ggplot(temp, aes(x = names, y = total_transcripts)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
## match genes between gtex and ensembl ----
## Strip the ".version" suffix from GTEx Ensembl ids before matching against
## the unversioned biomart ids.
gtex_names <- str_split_fixed(gtex_rowinfo$Name, "[.]", 2)[, 1]
## Split GTEx rows into nuclear-encoded and mitochondrial protein-coding sets.
## NOTE(review): the original assigned the index vector to `which`, shadowing
## base::which; descriptive names are used instead.
nuclear_idx <- which(genes$chromosome_name != "MT")
gtex_cleaned <- gtex[which(is.element(gtex_names, genes$ensembl_gene_id[nuclear_idx])), ]
mt_idx <- which(genes$chromosome_name == "MT")
gtex_cleanedMT <- gtex[which(is.element(gtex_names, genes$ensembl_gene_id[mt_idx])), ]
## non-mitochondrial TPM sum: per-tissue totals after dropping MT genes
temp <- data.frame(
  names = colnames(gtex_cleaned),
  total_transcripts = colSums(gtex_cleaned)
)
ggplot(temp, aes(x = names, y = total_transcripts)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
## mitochondrial TPM sum: how much of each tissue's signal was MT-encoded
temp <- data.frame(
  names = colnames(gtex_cleanedMT),
  total_transcripts = colSums(gtex_cleanedMT)
)
ggplot(temp, aes(x = names, y = total_transcripts)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
## the MT-only matrix is not needed beyond this point
rm(gtex_cleanedMT)
## renormalize TPM without mitochondrial genes ----
## Rescale every column so it sums to 1e6 again (TPM invariant) now that the
## mitochondrial rows are gone. sweep() replaces the original
## `for(i in 1:ncol(...))` loop: vectorized, and avoids the 1:0 footgun on a
## zero-column matrix. x * 1e6 / colsum == x / (colsum / 1e6).
gtex_cleaned <- sweep(gtex_cleaned, 2, colSums(gtex_cleaned) / 1e6, "/")
## sanity check: per-tissue totals should now all be ~1e6
temp <- data.frame(names = colnames(gtex_cleaned), total_transcripts = colSums(gtex_cleaned))
ggplot(temp, aes(x = names, y = total_transcripts)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
gtex <- gtex_cleaned; rm(gtex_cleaned)
## Working copy of the MT-free, renormalized expression matrix.
exp_mat <- gtex
## log10(TPM+1) transform of data
exp_mat <- log10(exp_mat+1)
## remove mitochondrial contribution
## NOTE(review): `grobl_supp` is not defined anywhere in this file as shown;
## this presumably renders a supplementary figure assembled in another chunk
## -- confirm it exists before running the script top to bottom.
grid.arrange(grobs=grobl_supp, nrow=4, heights=c(5,3,3,3))
## get gene ids (rows from df_1b) that correspond to given GO id/ GO label
## generate small set of brain related GO id/ GO labels
## plot on one v all plot
## Pull GO annotations for all genes (no biotype filter this time); this
## overwrites the earlier `genes` position table.
genes <- getBM(attributes = c('hgnc_symbol', 'ensembl_gene_id', 'go_id', 'name_1006'),
               mart = ensembl, useCache = FALSE)  # spell out FALSE, not F
#genes <- load("genes_w_go.RData")
## enrichment universe: every measured gene, version suffix stripped
allOE_genes <- str_split_fixed(rownames(exp_mat), "[.]", 2)[, 1]
## For each specificity measure: scatter one-vs-all expression with the top
## 1% of genes by delta (weighted - flat specificity change) highlighted in
## red, then run GO over-representation (BP) on that top 1% and draw the
## dotplot and enrichment map alongside.
ggl <- list()
for(measure in specificity_measures$func_names){
temp_df <- df_1b[which( df_1b$measure == measure),]
## top 1% of genes by delta; slice_max truncates the fractional n to integer
temp_df_sub <- slice_max(temp_df, order_by=delta, n=nrow(temp_df)/100 )
sigOE_genes <- str_split_fixed(temp_df_sub$Var1,"[.]",2)[,1]
ego <- enrichGO(gene = sigOE_genes,
universe = allOE_genes,
keyType = "ENSEMBL",
OrgDb = org.Hs.eg.db,
ont = "BP",
pAdjustMethod = "BH",
qvalueCutoff = 0.05,
readable = TRUE,
pool=TRUE)
## one-vs-all scatter; diagonal marks equal one/all expression
ggl[[measure]] <- ggplot(temp_df, aes(x=all, y=one))+
geom_point(size=0.1)+
geom_point(data=temp_df_sub, aes( x=all, y=one), color="red")+
ggtitle(measure)+
geom_abline(slope=1,intercept=0, color="red", size=0.3)+
theme_bw()+
theme(panel.grid.minor.x = element_blank())
## number of GO terms to display in the dotplot / enrichment map
n=10
## NOTE(review): filename is built but never used -- the matching
## png(filename) call appears to have been removed along with the
## commented-out dev.off() below.
filename <- paste(figures_dir,"/",measure,"_top1percent_deltas_go.png",sep = "")
ego <- enrichplot::pairwise_termsim(ego)
p1 <- ggl[[measure]]
p2 <- enrichplot::dotplot(ego, showCategory = n, font.size=8, label_format=15)+
theme(legend.position = "none")
p3 <- emapplot(ego, showCategory = n, cex_label_category=0.5, cex_category=0.5, cex_line=0.5 )+
scale_y_discrete(labels=str_wrap(ego@result$Description[1:n], width=15 ))
grid.arrange(p1, p2,p3, ncol=3) #, widths = c(3,3,4))
#dev.off()
}
#> wrong orderBy parameter; set to default `orderBy = "x"`
#> Warning: Removed 25284 rows containing missing values (geom_point).
#> wrong orderBy parameter; set to default `orderBy = "x"`
#> Warning: Removed 602 rows containing missing values (geom_point).
#> wrong orderBy parameter; set to default `orderBy = "x"`
#> Warning: Removed 602 rows containing missing values (geom_point).
#> wrong orderBy parameter; set to default `orderBy = "x"`
#> Warning: Removed 602 rows containing missing values (geom_point).
## dot similarity from initial Z scores ----
## Tissue-tissue similarity matrix and a single-linkage tree; the tree both
## orders the heatmap and yields the per-sample weights.
dot_sim <- similarity_func(exp_mat)
sim_tree <- cluster_func(dot_sim)
heatmap(dot_sim, Rowv = rev(sim_tree), Colv = sim_tree, scale = "none", margins = c(11, 13), symm = TRUE)
## NOTE(review): the original recomputed sim_tree here with the identical
## deterministic call; the redundant duplicate has been removed.
## plot similarity tree with weights (node size ~ sqrt of node weight)
par(mar = c(2, 2, 2, 15))
sim_tree %>%
  dendextend::set("nodes_pch", 19) %>%
  dendextend::set("nodes_cex", 2 * sqrt(get_nodes_attr(sim_tree, "weight"))) %>%
  plot(horiz = TRUE)
## take a look at the weights, ordered as in the tree
temp <- get_weights(sim_tree, get_leaves_attr(sim_tree, "label"))
temp <- data.frame(weights = temp, names = factor(names(temp), levels = rev(names(temp))))
g <- ggplot(temp, aes(y = fct_rev(as.factor(names)), x = weights)) +
  geom_col() #+
#theme(axis.text.y = element_text(angle = 90, vjust = 0.5, hjust=1))
g
## weighted vs flat (all-ones) specificity Z-score matrices
weights <- get_weights(sim_tree, colnames(exp_mat))
spec_mat_weighted <- calc_weighted_zscore_matrix(exp_mat, weights = weights)
flat <- weights; flat[] <- 1  # keeps names/length; matches later chunks' idiom
spec_mat_flat <- calc_weighted_zscore_matrix(exp_mat, flat)
## plot to look at global dif between flat and weighted
## Vector-valued specificity measures with flat vs tree-derived weights.
## (These stay as literal calls: hist() derives each plot's title from the
## deparsed argument expression, so wrapping them in a loop would silently
## change the figure labels.)
tau_flat <- calc_weighted_tau(exp_mat, flat)
tau_weighted <- calc_weighted_tau(exp_mat, weights)
tsi_flat <- calc_weighted_tsi(exp_mat,flat)
tsi_weighted <- calc_weighted_tsi(exp_mat,weights)
gini_flat <- calc_weighted_gini(exp_mat, flat)
gini_weighted <- calc_weighted_gini(exp_mat, weights)
## marginal distributions of each score, flat then weighted
hist(spec_mat_flat ,breaks=100)
hist(spec_mat_weighted ,breaks=100)
hist(tau_flat ,breaks=100)
hist(tau_weighted ,breaks=100)
hist(tsi_flat ,breaks=100)
hist(tsi_weighted ,breaks=100)
hist(gini_flat ,breaks=100)
hist(gini_weighted ,breaks=100)
## per-gene shift introduced by weighting (weighted minus flat)
hist(spec_mat_weighted - spec_mat_flat ,breaks=100)
hist(tau_weighted - tau_flat ,breaks=100)
hist(tsi_weighted - tsi_flat ,breaks=100)
hist(gini_weighted - gini_flat ,breaks=100)
## load data instead
if(TRUE){ ## NOTE! This chunk can require a lot of memory and time depending if running sequential. If you want to look at robustness test results it is recommended you have enabled parallel computing (e.g. check getDoParWorkers > 3 or 4 (preferably more))
## Robustness test: starting from partition P1, append n = 1..num_brain_samples
## columns drawn from its complement and record, per specificity measure and
## per weighting scheme, the resulting score matrices for later comparison
## against the n = 1 baseline.
## Use this variable to control whether performing random or true brain-non-brain partition
random_partition = FALSE
## one empty slot per (partition, measure) pair, named "P1_<m>" / "P2_<m>"
robustness_test_results <- list( )
for(i in 1:length(specificity_measures$func_names)){
el_names <- paste( c( "P1_", "P2_")
, specificity_measures$func_names[i], sep="")
temp_list <- setNames(list(list(), list()), nm=el_names )
robustness_test_results <- c(robustness_test_results, temp_list )
}
num_brain_samples <- length(colnames(exp_mat)[grep("[Bb]rain",colnames(exp_mat))])
num_reps <- 3
## use general similarity_func and cluster_func in case we want to test more later
similarity_func <- function(exp_mat){calc_dot_product_similarity_matrix(calc_zscore_matrix(exp_mat))}
cluster_func <- function(sim_mat){add_weights(add_dist_to_parent(as.dendrogram(hclust(as.dist(1-sim_mat), method = "single") ) ))}
## to switch analysis between True Brain Partition and equivalent sized Random Partition set random_partition ##
if(random_partition){
which <- sample(1:ncol(exp_mat), length(grep("[Bb]rain",colnames(exp_mat),invert = TRUE)))
P1 <- as.matrix(exp_mat[, which])
notP1 <- as.matrix(exp_mat[, which(!is.element(1:ncol(exp_mat),which ))])
}else{
## P1 and notP1 are constant throughout, so set these outside loops
P1 <- as.matrix(exp_mat[,grep("[Bb]rain",colnames(exp_mat), invert = TRUE)])
notP1 <- as.matrix(exp_mat[,grep("[Bb]rain",colnames(exp_mat), invert = FALSE)])
}
### define 1-n trajectories ahead of time since i+1 should be referenced against i.
## do foreach over each element of permutation matrix ## use arrayInd to get row/col given perm number
permutation_matrix <- matrix(nrow = num_brain_samples, ncol = num_reps )
for(rep in 1:num_reps){
permutation_matrix[,rep] <- sample(colnames(notP1),num_brain_samples, replace = FALSE)
}
## NOTE(review): when the saved files below are present, the freshly sampled
## permutation_matrix above is immediately overwritten -- the sampling is
## then dead code kept only as a fallback.
if(!random_partition){
load("final_permutation_matrix") ## reproduce same used in paper figures
}
if(random_partition){
load("final_random_permutation_matrix") ## reproduce same used in paper figures
}
for(measure in specificity_measures$func_names){
specificity_func <- specificity_measures$funcs[[measure]]
if(!is.function(specificity_func)){print(paste(measure, "func is not a function")); next;}
## one parallel task per (n, rep) cell of permutation_matrix; .final names
## each result "rep=R,n=N" from its array index
results <- foreach(perm=1:length(permutation_matrix), .errorhandling = "pass", .final = function(x) setNames(x,paste("rep=", arrayInd(1:length(permutation_matrix), dim(permutation_matrix))[,2], ",n=", arrayInd(1:length(permutation_matrix), dim(permutation_matrix ))[,1], sep = ""))) %dopar% {
library(Hmisc)
library(dendextend)
library(reldist)
sample_indices <- 1:(arrayInd(perm, dim(permutation_matrix))[,1]) ## row is sample name, all names in a row up to top are choices for n=1,2..i
rep_index <- arrayInd(perm, dim(permutation_matrix))[,2] ## column is replicate number
sample_set <- permutation_matrix[sample_indices, rep_index]
# P1_baseline is just P1 at n=1
P2 <- notP1[,sample_set, drop=F]; colnames(P2) <- make.unique(colnames(P2)) ## make.unique will facilitate sampling with replacement
P1uP2 <- cbind(P1,P2)
## P1uP2 ## exp_mat > get sim mat > get sim tree > get samp weights > get spec mat
sim_mat <- similarity_func(P1uP2)
sim_tree <- cluster_func(sim_mat)
weights <- get_weights(sim_tree, colnames(P1uP2))
flat <- weights; flat[] <- 1
specificity_flat <- specificity_func(P1uP2, flat)
specificity_weighted <- specificity_func(P1uP2, weights)
# need to fill in these with spec_score matrices for each n for each method
# names should match
### restore once error fixed
if(specificity_measures$out_type[[measure]]=="matrix"){
result <- list(
P1_weighted_spec_score = specificity_weighted[,colnames(P1), drop=F],
P1_flat_spec_score = specificity_flat[,colnames(P1), drop=F],
P2_weighted_spec_score = specificity_weighted[,colnames(P2), drop=F],
P2_flat_spec_score = specificity_flat[,colnames(P2), drop=F]
)
}else if(specificity_measures$out_type[[measure]]=="vector"){
## NOTE(review): vector measures cannot be subset per partition, so the same
## whole vector is stored under both the P1 and P2 slots.
result <- list(P1_weighted_spec_score = specificity_weighted,
P1_flat_spec_score = specificity_flat,
P2_weighted_spec_score = specificity_weighted,
P2_flat_spec_score = specificity_flat
)
}
}
## split each task's result into P1 and P2 halves, freeing entries as we go
results_P1 <- list()
results_P2 <- list()
for(i in 1:length(results)){
name <- names(results)[i]
## BUG(review): the next two lines index names(results)[1] instead of [i],
## so rep/n always describe the FIRST element. Both variables are unused
## below, so this is currently dead code -- fix the index or delete them.
rep <- str_split_fixed(names(results)[1], "[=,]", n=4)[,2]
n <- str_split_fixed(names(results)[1], "[=,]", n=4)[,4]
results_P1[[name]] <- list(P1_weighted_spec_score = results[[name]]$P1_weighted_spec_score, P1_flat_spec_score = results[[name]]$P1_flat_spec_score)
results[[name]]$P1_weighted_spec_score <- NULL; results[[name]]$P1_flat_spec_score <- NULL; ## large object so delete as we go to avoid copy
results_P2[[name]] <- list(P2_weighted_spec_score = results[[name]]$P2_weighted_spec_score, P2_flat_spec_score = results[[name]]$P2_flat_spec_score)
results[[name]]$P2_weighted_spec_score <- NULL; results[[name]]$P2_flat_spec_score<- NULL; ## large object so delete as we go to avoid copy
}
robustness_test_results[[paste("P1_", measure,sep = "")]] <- results_P1; rm(results_P1)
robustness_test_results[[paste("P2_", measure,sep = "")]] <- results_P2; rm(results_P2)
}
} ## end of chunk control
## output to look at is robustness_test_results
#save.image()
df_robustness_test_results <- data.frame(measure=character(),
P1orP2=character(),
weight_or_flat=character(),
n=numeric(),
rep=numeric(),
variance=numeric()
)
## Flatten robustness_test_results into the long data frame: for every
## (partition, measure) element and every "rep=R,n=N" entry, compute the
## variance of (score at n) - (score at n = 1 baseline of the same rep),
## separately for the weighted and flat score sets.
for(el in 1:length(robustness_test_results)){
if(length(robustness_test_results[[el]])==0){next;}
## element name is "P1_<measure>" / "P2_<measure>"
temp_name <- str_split_fixed(names(robustness_test_results[el]),"_",2)
temp_len <- length(robustness_test_results[[el]]) ## each has weighted and flat
temp_df <- data.frame(measure=rep(temp_name[2],temp_len),
P1orP2=rep(temp_name[1],temp_len),
weight_or_flat=character(temp_len),
n=numeric(temp_len),
rep=numeric(temp_len),
variance=numeric(temp_len))
## two parallel copies of the skeleton, one per weighting scheme
temp_df_list <- list("weighted"=temp_df, "flat"=temp_df)
temp_df_list[["weighted"]]$weight_or_flat <- "weighted"
temp_df_list[["flat"]]$weight_or_flat <- "flat"
## build the temp_dfs in these loops
for(i in 1:length(robustness_test_results[[el]])){
for(f_or_w in c("flat","weighted")){
## "rep=R,n=N" splits to c("rep", R, "n", N); value sits after its label
temp_name <- str_split_fixed(names(robustness_test_results[[el]][i]),"[,=]",4)
temp_df_list[[f_or_w]]$rep[i] <- as.numeric(temp_name[,grep("rep", temp_name)+1])
temp_df_list[[f_or_w]]$n[i] <- as.numeric(temp_name[,grep("n", temp_name)+1])
temp <- robustness_test_results[[el]][[i]]
temp <- temp[[grep(f_or_w, names(temp))]]
## baseline matched on replicate, and is set at n=1
temp_baseline <- robustness_test_results[[el]][[paste("rep=",temp_name[,grep("rep", temp_name)+1],",n=1",sep="")]]
temp_baseline <- temp_baseline[[grep(f_or_w, names(temp_baseline))]]
temp_df_list[[f_or_w]]$variance[i] <- var(as.vector(as.matrix(temp)) - as.vector(as.matrix(temp_baseline)), na.rm=T)
}
}
## compile temp_dfs into final result
## (rbind in a loop grows quadratically; acceptable at this data size)
df_robustness_test_results <- rbind(df_robustness_test_results, temp_df_list[["weighted"]], temp_df_list[["flat"]])
}
df_2c <- df_robustness_test_results
## Summarise variance across replicates per (measure, partition, weighting, n):
## mean and sd, plus mean +/- SEM bounds for the ribbon plots.
## The grouping list is built once instead of being duplicated per aggregate()
## call, and the pointless intermediate copy (`temp`) is removed.
by_groups <- list(measure = df_2c$measure,
                  P1orP2 = df_2c$P1orP2,
                  weight_or_flat = df_2c$weight_or_flat,
                  n = df_2c$n)
df_2c_summary_by_partition <- aggregate(df_2c$variance, by = by_groups,
                                        FUN = function(x) mean(x, na.rm = TRUE))
names(df_2c_summary_by_partition)[ncol(df_2c_summary_by_partition)] <- "mean_var"
## identical grouping guarantees row order matches, so $x lines up
df_2c_summary_by_partition$sd_of_var <- aggregate(df_2c$variance, by = by_groups,
                                                  FUN = function(x) sd(x, na.rm = TRUE))$x
## ribbon bounds: mean +/- standard error over num_reps replicates
df_2c_summary_by_partition$y_min <- df_2c_summary_by_partition$mean_var -
  df_2c_summary_by_partition$sd_of_var / sqrt(num_reps)
df_2c_summary_by_partition$y_max <- df_2c_summary_by_partition$mean_var +
  df_2c_summary_by_partition$sd_of_var / sqrt(num_reps)
ggl <- list()
## Variance-vs-n panels: one plot for the Z-score measure, one for all other
## measures; ribbon = mean +/- SEM, points = individual replicates.
for(measure in c("Zscore", "notZscore")){
  if(measure == "Zscore"){
    ## (index vectors renamed from `which`/`which2` to avoid shadowing base::which)
    idx_summary <- which(df_2c_summary_by_partition$measure == measure)
    idx_points <- which(df_robustness_test_results$measure == measure)
  } else {
    idx_summary <- which(df_2c_summary_by_partition$measure != "Zscore")
    idx_points <- which(df_robustness_test_results$measure != "Zscore")
  }
  gg <- ggplot(df_2c_summary_by_partition[idx_summary,], aes(x=n, y=mean_var)) +
    scale_color_manual(values=c("grey20","steelblue3"))+
    scale_fill_manual(values=c("grey20","steelblue3"))+
    geom_ribbon(aes( group=weight_or_flat, ymin=y_min, ymax=y_max, fill=weight_or_flat),alpha=0.5)+
    ## FIX(review): geom_line has no ymin/ymax/fill aesthetics; passing them
    ## produced the recorded "Ignoring unknown aesthetics" warnings, so they
    ## are dropped here (rendering is unchanged).
    geom_line(aes( group=weight_or_flat, color=weight_or_flat))+
    geom_point(data = df_robustness_test_results[idx_points,], size=0.2,
               aes(x=n, y=variance,group=weight_or_flat,color=weight_or_flat, fill=weight_or_flat))+
    theme_bw()+
    theme(panel.grid.minor.x = element_blank())+
    scale_x_continuous(limits = c(0,13), breaks=seq(0,12,by=1), expand = c(0,0))
  if(measure == "Zscore"){
    gg <- gg+facet_wrap(measure ~ P1orP2 , scales = "free",
                        labeller=labeller(P1orP2=c("P1"="P1 (non-brain)", "P2"="P2 (brain)")))
    gg <- gg+theme(legend.position = "none")
  } else {
    gg <- gg+facet_wrap( ~ measure, ncol = 3)
    gg <- gg+theme(legend.position="bottom")
  }
  ggl[[measure]] <- gg
}
grobl <- list()
for(nm in names(ggl)){
  grobl[[nm]] <- ggplotGrob(ggl[[nm]])
}
grid.arrange(grobs=grobl, nrow=2)
### stats reported in paper
## paired t-test for difference in means
### t = delta / sd / sqrt(n)
## Row selector for the Z-score measure at the largest n, by partition and
## weighting scheme (extracted to avoid repeating the four-clause filter).
sel_rows <- function(partition, weighting) {
  which(df_2c$measure == "Zscore" &
          df_2c$P1orP2 == partition &
          df_2c$n == max(df_2c$n) &
          df_2c$weight_or_flat == weighting)
}
which_list <- list()
which_list[["Z_P1_flat"]]     <- sel_rows("P1", "flat")
which_list[["Z_P1_weighted"]] <- sel_rows("P1", "weighted")
which_list[["Z_P2_flat"]]     <- sel_rows("P2", "flat")
which_list[["Z_P2_weighted"]] <- sel_rows("P2", "weighted")
print("t-test P1 weighted / P1 flat : %diff between weighted and flat")
#> [1] "t-test P1 weighted / P1 flat : %diff between weighted and flat"
## t.test on logs gives ratio estimates
temp <- t.test(x=log10(df_2c[which_list$Z_P1_weighted,]$variance),y=log10(df_2c[which_list$Z_P1_flat,]$variance) ,paired=TRUE, conf.level = 0.95)
## antilog to recover ratio of weighted / flat
1-10^temp$estimate
#> mean of the differences
#> 0.7932209
1-10^temp$conf.int
#> [1] 0.8021299 0.7839107
#> attr(,"conf.level")
#> [1] 0.95
### 79.3% lower variance in weighted than flat (CI)
print("t-test P2 weighted / P2 flat : %diff between weighted and flat")
#> [1] "t-test P2 weighted / P2 flat : %diff between weighted and flat"
temp <- t.test(x=log10(df_2c[which_list$Z_P2_weighted,]$variance),y=log10(df_2c[which_list$Z_P2_flat,]$variance) ,paired=TRUE, conf.level = 0.95)
1-10^temp$estimate
#> mean of the differences
#> 0.3578047
1-10^temp$conf.int
#> [1] 0.4196920 0.2893173
#> attr(,"conf.level")
#> [1] 0.95
###
## jaccard between n = 1 and n = num_brain_samples for brain v non-brain OR P1 v P2
## Empty accumulator: one row per (measure, n, rep, cutoff, sample) holding
## the Jaccard overlap with the n = 1 baseline plus supporting gene counts.
df_2d <- data.frame(
  measure = character(), n = numeric(),
  rep = numeric(), cut_point = numeric(),
  jaccard = numeric(), P1orP2 = character(),
  weighted = logical(), samp = character(),
  gene_count = numeric(), gene_count_baseline = numeric(),
  gene_count_intersect = numeric()
)
## nested for loop builds df of jaccard results for plotting
Sys.time()
#> [1] "2022-01-02 13:47:37 PST"
## For every (partition, measure) element: compare the gene set passing each
## score cutoff at step n against the n = 1 baseline of the same replicate,
## per sample column, recording Jaccard index and the underlying gene counts.
for(el in names(robustness_test_results)){
if(length(robustness_test_results[[el]])==0 ){next;}
P1orP2 <- str_split_fixed(el,"_",n=2)[1]
measure <- str_split_fixed(el,"_",n=2)[2]
## Z-scores are cut on an absolute scale, other measures on [0,1]
if(measure == "Zscore"){cuts <- seq(0,4,0.5) ## set cutoffs
}else{ cuts <- seq(0,1,0.05)}
# for(i in 1:length(robustness_test_results[[el]]) ){ ### foreach at this level if implementing foreach
df_temp <- foreach(i=1:length(robustness_test_results[[el]]), .errorhandling = "pass", .combine = rbind) %dopar% {
#foreach(perm=1:length(permutation_matrix), .errorhandling = "pass", .final = function(x) setNames(x,paste("rep=", arrayInd(1:length(permutation_matrix), dim(permutation_matrix))[,2], ",n=", arrayInd(1:length(permutation_matrix), dim(permutation_matrix ))[,1], sep = ""))) %dopar% {
library(stringr)
df_internal <- data.frame(measure=character(), n=numeric(),
rep=numeric(), cut_point=numeric(),
jaccard=numeric(), P1orP2=character(),
weighted=logical(), samp=character(),
gene_count=numeric(), gene_count_baseline=numeric(),
gene_count_intersect=numeric())
## parse n and rep back out of the "rep=R,n=N" element name
## NOTE(review): `rep` here shadows base::rep within this task; base rep()
## remains callable as a function but renaming would be cleaner.
n = as.numeric(str_split_fixed(names(robustness_test_results[[el]][i]), "[=,]", n=4 )[,4])
rep = as.numeric(str_split_fixed(names(robustness_test_results[[el]][i]), "[=,]", n=4 )[,2])
for(weighted_or_flat in c("weighted","flat")){
baseline_name <- names(robustness_test_results[[el]])[grep(paste("rep=",rep,",n=",1,"$", sep = ""),names(robustness_test_results[[el]]) )]
if(specificity_measures$out_type[measure] == "matrix"){
weighted_flat_index <- grep(weighted_or_flat,names(robustness_test_results[[el]][[baseline_name]]))
baseline_mat <- robustness_test_results[[el]][[baseline_name]][[weighted_flat_index]]
weighted_flat_index <- grep(weighted_or_flat,names(robustness_test_results[[el]][[i]]))
## align columns with the baseline before comparing
temp_mat <- robustness_test_results[[el]][[i]][[weighted_flat_index]][,colnames(baseline_mat),drop=FALSE]
}else if(specificity_measures$out_type[measure] == "vector"){
weighted_flat_index <- grep(weighted_or_flat,names(robustness_test_results[[el]][[baseline_name]]))
baseline_mat <- as.matrix(robustness_test_results[[el]][[baseline_name]][[weighted_flat_index]])
weighted_flat_index <- grep(weighted_or_flat,names(robustness_test_results[[el]][[i]]))
temp_mat <- as.matrix(robustness_test_results[[el]][[i]][[weighted_flat_index]])
}else{ stop("WARNING: no out type associated with measure") }
## preallocate one row per (sample, cutoff) pair
ncols <- ncol(temp_mat); ncuts <- length(cuts)
temp_df <- data.frame(measure=rep(measure, ncols*ncuts), n=rep(n, ncols*ncuts),
rep=rep(rep, ncols*ncuts), cut_point=numeric(length=ncols*ncuts),
jaccard=numeric(length=ncols*ncuts), P1orP2=rep(P1orP2, ncols*ncuts),
weighted=rep( grepl("weighted", weighted_or_flat), ncols*ncuts),
samp=character(length=ncols*ncuts),
gene_count=numeric(length=ncols*ncuts), gene_count_baseline=numeric(length=ncols*ncuts),
gene_count_intersect=numeric(length=ncols*ncuts) )
for(s in 1:ncol(temp_mat) ){ ## add a row to df for each sample
for(cut_index in 1:length(cuts)){
ind <- (s-1)*length(cuts)+cut_index ### use arr_ind here instead
temp_df[ind,]$cut_point <- cuts[cut_index]
## NOTE(review): both cuts sequences start at 0, so this negative-cutoff
## branch is never taken with the current settings (kept for completeness).
if(cuts[cut_index]<0){
temp_df[ind,]$jaccard <- ( length( which( baseline_mat[,s] <= cuts[cut_index] & temp_mat[,s] <= cuts[cut_index] )) /
length( which( baseline_mat[,s] <= cuts[cut_index] | temp_mat[,s] <= cuts[cut_index] )) )
temp_df[ind,]$gene_count <- length( which(temp_mat[,s] <= cuts[cut_index] ) )
temp_df[ind,]$gene_count_baseline <- length( which(baseline_mat[,s] <= cuts[cut_index] ) )
temp_df[ind,]$gene_count_intersect <- length( which( baseline_mat[,s] <= cuts[cut_index] & temp_mat[,s] <= cuts[cut_index] ))
} else {
## Jaccard = |both above cut| / |either above cut|
temp_df[ind,]$jaccard <- ( length( which( baseline_mat[,s] >= cuts[cut_index] & temp_mat[,s] >= cuts[cut_index] )) /
length( which( baseline_mat[,s] >= cuts[cut_index] | temp_mat[,s] >= cuts[cut_index] )) )
temp_df[ind,]$gene_count <- length( which(temp_mat[,s] >= cuts[cut_index] ) )
temp_df[ind,]$gene_count_baseline <- length( which(baseline_mat[,s] >= cuts[cut_index] ) )
temp_df[ind,]$gene_count_intersect <- length( which( baseline_mat[,s] >= cuts[cut_index] & temp_mat[,s] >= cuts[cut_index] ))
}
## NOTE(review): unique() is called as unique(x, incomparables = ...) here --
## the second colnames() is treated as 'incomparables', not a second vector.
## Presumably this was meant to assert baseline/temp column names agree;
## verify the intent.
temp_df[ind,]$samp <- ifelse(!is.null((colnames(baseline_mat[,s,drop=FALSE] ) )),
unique(colnames(baseline_mat[,s,drop=FALSE] ),
colnames(temp_mat[,s,drop=FALSE] ) ), NA )
}
}
df_internal <- rbind(df_internal, temp_df)
}
df_internal
}
df_2d <- rbind(df_2d, df_temp)
}
Sys.time()
#> [1] "2022-01-02 14:24:56 PST"
## Summarise Jaccard and gene counts per (measure, n, cutoff, partition,
## weighting). The same grouping list is repeated in all five aggregate()
## calls; the $x column-binds below rely on aggregate() returning rows in
## the identical order, which holds because the by= lists are identical.
df_2d_summary_by_partition <-aggregate(df_2d$jaccard, by=list(measure=df_2d$measure, n=df_2d$n,
cut_point=df_2d$cut_point, P1orP2=df_2d$P1orP2,
weighted=df_2d$weighted),
FUN=function(x) mean(x,na.rm=TRUE) )
names(df_2d_summary_by_partition)[length(names(df_2d_summary_by_partition))] <- "jaccard_mean"
df_2d_summary_by_partition$jaccard_sd <- aggregate(df_2d$jaccard, by=list(measure=df_2d$measure, n=df_2d$n,
cut_point=df_2d$cut_point, P1orP2=df_2d$P1orP2,
weighted=df_2d$weighted) ,
FUN=function(x) sd(x,na.rm = TRUE) )$x
df_2d_summary_by_partition$gene_count <- aggregate(df_2d$gene_count, by=list(measure=df_2d$measure, n=df_2d$n,
cut_point=df_2d$cut_point, P1orP2=df_2d$P1orP2,
weighted=df_2d$weighted)
, FUN=function(x) mean(x,na.rm = TRUE) )$x
df_2d_summary_by_partition$gene_count_baseline <- aggregate(df_2d$gene_count_baseline, by=list(measure=df_2d$measure, n=df_2d$n,
cut_point=df_2d$cut_point, P1orP2=df_2d$P1orP2,
weighted=df_2d$weighted),
FUN=function(x) mean(x,na.rm = TRUE) )$x
df_2d_summary_by_partition$gene_count_intersect <- aggregate(df_2d$gene_count_intersect, by=list(measure=df_2d$measure, n=df_2d$n,
cut_point=df_2d$cut_point, P1orP2=df_2d$P1orP2,
weighted=df_2d$weighted),
FUN=function(x) mean(x,na.rm = TRUE) )$x
ggl_jacc <- list()
## Mean Jaccard vs cutoff, one line per n; faceted by weighting and partition
## (Z-score panel) or by weighting and measure (all other measures).
for(measure in c("Zscore", "notZscore")){
  if(measure == "Zscore"){
    ## (renamed from `which` to avoid shadowing base::which)
    idx <- which(df_2d_summary_by_partition$measure == measure)
  } else {
    idx <- which(df_2d_summary_by_partition$measure != "Zscore")
  }
  ## NOTE(review): ribbon bounds here divide sd by num_reps, whereas the
  ## variance plots earlier use sd/sqrt(num_reps) for the SEM -- confirm
  ## which is intended; behavior kept as-is to match the published figures.
  gg <- ggplot(df_2d_summary_by_partition[idx,],
               aes(x=cut_point, y=jaccard_mean,
                   ymin=jaccard_mean-jaccard_sd/num_reps,
                   ymax=jaccard_mean+jaccard_sd/num_reps,
                   group=n, fill=n))+
    theme_bw()+
    theme(
      panel.grid.minor.x = element_blank())+
    #ggtitle(paste("jaccard index for ", measure, " P1 and P2, weighted and flat measures",sep=""))+
    geom_line(aes(color=n))+
    scale_color_continuous(low="powderblue", high="steelblue4")+
    scale_y_continuous(limits = c(0,1), breaks=seq(0,1,by=0.2))
  ## FIX(review): the original added a facet_grid here that both branches
  ## below immediately replaced; the redundant dead layer has been removed.
  if(measure == "Zscore"){
    gg <- gg+scale_x_continuous(limits = c(0,4), breaks=seq(0,4,by=0.5), expand = c(0,0))
    gg <- gg + facet_grid(weighted ~ P1orP2, labeller=labeller(weighted=c("TRUE"="weighted", "FALSE"="flat")) )
  } else {
    gg <- gg+scale_x_continuous(limits = c(0,1), breaks=seq(0,1,by=0.1) ,expand = c(0,0))
    gg <- gg + facet_grid(weighted ~ measure, labeller=labeller(weighted=c("TRUE"="weighted", "FALSE"="flat")) )
  }
  ggl_jacc[[measure]] <- gg
}
grobl <- list()
for(nm in names(ggl_jacc)){
  grobl[[nm]] <- ggplotGrob(ggl_jacc[[nm]])
}
grid.arrange(grobs =list(grobl$Zscore,grobl$notZscore), nrow=2)
## for random supplemental
if(random_partition){
  grid.arrange(grobs = grobl, nrow=2, heights=c(5,4) )
}
ggl_count <- list()
## Fraction of genes passing each cutoff (mean gene_count over total genes),
## log10 y axis with percent labels; same faceting as the Jaccard panels.
for(measure in c("Zscore", "notZscore")){
  if(measure == "Zscore"){
    ## (renamed from `which` to avoid shadowing base::which)
    idx <- which(df_2d_summary_by_partition$measure == measure)
  } else {
    idx <- which(df_2d_summary_by_partition$measure != "Zscore")
  }
  gg <- ggplot(df_2d_summary_by_partition[idx,],
               aes(x=cut_point, y=gene_count / nrow(genes),
                   group=n, fill=n))+
    theme_bw()+
    theme(
      panel.grid.minor.x = element_blank())+
    geom_line(aes(color=n))+
    scale_color_continuous(low="steelblue2", high="steelblue4")+
    ## FIX(review): removed the stray trailing comma (an empty argument) that
    ## the original passed into scale_y_continuous(); commented-out
    ## scale_y_log10 experiment dropped for clarity.
    scale_y_continuous(limits = c(1e-4, 1), trans = log10_trans(),
                       breaks = trans_breaks("log10", function(x) 10^x),
                       labels = percent)
  if(measure == "Zscore"){
    gg <- gg+scale_x_continuous(limits = c(0,4), breaks=seq(0,4,by=0.5), expand = c(0,0))
    gg <- gg + facet_grid(weighted ~ P1orP2, labeller=labeller(weighted=c("TRUE"="weighted", "FALSE"="flat")) )
  } else {
    gg <- gg+scale_x_continuous(limits = c(0,1), breaks=seq(0,1,by=0.1) ,expand = c(0,0))
    gg <- gg + facet_grid(weighted ~ measure, labeller=labeller(weighted=c("TRUE"="weighted", "FALSE"="flat")) )
  }
  ggl_count[[measure]] <- gg
}
grobl <- list()
for(nm in names(ggl_count)){
  grobl[[nm]] <- ggplotGrob(ggl_count[[nm]])
}
#> Warning: Transformation introduced infinite values in continuous y-axis
grid.arrange(grobs = list(grobl$Zscore, grobl$notZscore), nrow=2)
## for random supplemental
if(random_partition){
  grid.arrange(grobs = grobl, nrow=2, heights=c(5,4) )
}
### stats reported in paper
## paired t-test for difference in means
### t = delta / sd / sqrt(n)
## One-sample t-tests on the Jaccard index at Z-score cutoff 2 and the
## largest n, per partition and weighting; estimates/CIs recorded below.
which_list <- list()
which_list[["Z_P1_flat"]] <- which(df_2d$measure=="Zscore"
& df_2d$n==max(df_2d$n)
& df_2d$cut_point==2
& df_2d$P1orP2=="P1"
& df_2d$weighted==FALSE)
which_list[["Z_P1_weighted"]] <- which(df_2d$measure=="Zscore"
& df_2d$n==max(df_2d$n)
& df_2d$cut_point==2
& df_2d$P1orP2=="P1"
& df_2d$weighted==TRUE)
which_list[["Z_P2_flat"]] <- which(df_2d$measure=="Zscore"
& df_2d$n==max(df_2d$n)
& df_2d$cut_point==2
& df_2d$P1orP2=="P2"
& df_2d$weighted==FALSE)
which_list[["Z_P2_weighted"]] <- which(df_2d$measure=="Zscore"
& df_2d$n==max(df_2d$n)
& df_2d$cut_point==2
& df_2d$P1orP2=="P2"
& df_2d$weighted==TRUE)
print("t-test P2 flat jaccard")
#> [1] "t-test P2 flat jaccard"
temp <- t.test(x=df_2d[which_list$Z_P2_flat,]$jaccard, conf.level = 0.95)
temp$estimate
#> mean of x
#> 0.2513928
temp$conf.int
#> [1] 0.1101765 0.3926090
#> attr(,"conf.level")
#> [1] 0.95
print("t-test P2 weighted jaccard")
#> [1] "t-test P2 weighted jaccard"
temp <- t.test(x=df_2d[which_list$Z_P2_weighted,]$jaccard, conf.level = 0.95)
temp$estimate
#> mean of x
#> 0.6359852
temp$conf.int
#> [1] 0.5740172 0.6979532
#> attr(,"conf.level")
#> [1] 0.95
print("t-test P1 flat jaccard")
#> [1] "t-test P1 flat jaccard"
temp <- t.test(x=df_2d[which_list$Z_P1_flat,]$jaccard, conf.level = 0.95)
temp$estimate
#> mean of x
#> 0.6495905
temp$conf.int
#> [1] 0.6367873 0.6623936
#> attr(,"conf.level")
#> [1] 0.95
print("t-test P1 weighted jaccard")
#> [1] "t-test P1 weighted jaccard"
temp <- t.test(x=df_2d[which_list$Z_P1_weighted,]$jaccard, conf.level = 0.95)
temp$estimate
#> mean of x
#> 0.807743
temp$conf.int
#> [1] 0.7983065 0.8171795
#> attr(,"conf.level")
#> [1] 0.95
if(TRUE){
### Matrix built from Z score first then add columns of change in other scores
## NOT going to be general to arbitrary sets of matrix measures -> use only Z score and then concatenate other measures
## Build delta / weighted / flat specificity matrices for every measure on
## the full expression matrix, pick the genes whose scores change most per
## tissue, and draw side-by-side heatmaps.
spec_mats <- list()
sim_mat <- similarity_func(exp_mat)
sim_tree <- cluster_func(sim_mat)
weights <- get_weights(sim_tree, colnames(exp_mat))
flat <- weights; flat[] <- 1
for(measure in specificity_measures$func_names){
specificity_func <- specificity_measures$funcs[[measure]]
if(!is.function(specificity_func)){print(paste(measure, "func is not a function")); next;}
if(specificity_measures$out_type[measure]=="matrix"){
spec_mat_flat <- specificity_func(exp_mat, flat)
spec_mat_weighted <- specificity_func(exp_mat, weights)
} else if (specificity_measures$out_type[measure]=="vector"){
## vector measures coerced to one-column matrices so delta arithmetic works
spec_mat_flat <- as.matrix(specificity_func(exp_mat, flat))
rownames(spec_mat_flat) <- rownames(exp_mat)
spec_mat_weighted <- as.matrix(specificity_func(exp_mat, weights))
rownames(spec_mat_weighted) <- rownames(exp_mat)
} else { stop("WARNING: no out type associated with measure") }
spec_mats[[paste("delta", measure,sep = "_")]] <- spec_mat_weighted - spec_mat_flat
spec_mats[[paste("weighted", measure,sep = "_")]] <- spec_mat_weighted
spec_mats[[paste("flat", measure,sep = "_")]] <- spec_mat_flat
}
## look at top n and bottom n genes with greatest change in specificity score between weighted and flat
gene_list <- list(up=list(),down=list())
for(spec_mat_name in names(spec_mats)[grep("delta", names(spec_mats))] ){
spec_mat <- spec_mats[[spec_mat_name]]
measure <- str_split_fixed(spec_mat_name, "_", 2)[,2]
## for each tissue for matrix measures
if(specificity_measures$out_type[measure]=="matrix"){
n=5 ## top n for each tissue (column) of matrix
temp_name <- measure
gene_list$up[[temp_name]] <- data.frame("gene"=numeric())
gene_list$down[[temp_name]] <- data.frame("gene"=numeric())
## go in same order as similarity tree
## NOTE(review): rownames are taken from spec_mat_flat, which at this point
## is whatever matrix the PREVIOUS loop assigned last (last measure
## processed), while spec_mat is the current delta matrix. This relies on
## all matrices sharing identical row order -- confirm that assumption.
for(i in get_leaves_attr(sim_tree, "label")){
gene_list$up[[temp_name]] <- rbind(gene_list$up[[temp_name]],
data.frame("gene"=rownames(spec_mat_flat)[order((spec_mat[,i]), decreasing = TRUE)[1:n]]))
}
for(i in get_leaves_attr(sim_tree, "label")){
gene_list$down[[temp_name]] <- rbind(gene_list$down[[temp_name]],
data.frame("gene" = rownames(spec_mat_flat)[order((spec_mat[,i]*(-1)), decreasing = TRUE)[1:n]]))
}
} else if (specificity_measures$out_type[measure]=="vector"){
## vector measures intentionally skipped: no per-tissue columns to rank
} else { stop("WARNING: no out type associated with measure") }
}
### gene lists generated now to organize into heatmaps
## set up color functions for heatmaps
## diverging blue-white-red ramp anchored at +/-2 (T used for na.rm here;
## TRUE would be safer style)
col_funs <- list()
for(type in c("delta","flat","weighted")){
col_funs[[paste( type, "Zscore" ,sep="_")]] <-colorRamp2(c(min(c(-2, min(spec_mats[[paste( type, "Zscore" ,sep="_")]], na.rm = T)) ),
-1.99, -1, 0, 1, 1.99,
max(c(2, max( spec_mats$delta_Zscore, na.rm = T) ) )),
c("darkblue", "blue", "#B8B8FF" ,"white", "#FFB8B8", "red","red4"))
}
## get genes in top and bottom (note: genes may be repeated since want to see top n for each group )
gene_vec <- data.frame(gene = character())
for(ud in names(gene_list)){
for(measure in names(gene_list[[ud]])){
gene_vec <- rbind(gene_vec, gene_list[[ud]][[measure]])
}
}
## one HeatmapList per matrix type; measures are appended side by side
heatmaps <- list()
for(type in c("delta", "flat", "weighted")){
heatmaps[[type]] <- HeatmapList()
for(measure in names(gene_list[["up"]])){
spec_mat_name <- names(spec_mats)[intersect(grep(measure, names(spec_mats)), grep(type, names(spec_mats)))]
spec_mat <- spec_mats[[spec_mat_name ]]
temp_name <- paste(spec_mat_name, sep="_")
if(measure=="Zscore"){
h1 <- Heatmap(spec_mat[gene_vec$gene,], name=temp_name,
row_order = gene_vec$gene, show_row_names = FALSE,
cluster_columns = sim_tree,
column_order = colnames(spec_mat),
col = col_funs[[paste(type,measure, sep="_")]])
} else {
h1 <- Heatmap(spec_mat[gene_vec$gene,], name=temp_name,
row_order = gene_vec$gene, show_row_names = FALSE,
col = col_funs[[paste(type,measure, sep="_")]])
}
if(length(heatmaps[[type]])==0){
heatmaps[[type]] <- h1
} else {
heatmaps[[type]] <- heatmaps[[type]] + h1
}
}
}
heatmaps$delta
heatmaps$flat + heatmaps$weighted
}
## Quick look at the gene annotation table.
head(gtex_rowinfo)
#> Name Description
#> 1 ENSG00000223972.5 DDX11L1
#> 2 ENSG00000227232.5 WASH7P
#> 3 ENSG00000278267.1 MIR6859-1
#> 4 ENSG00000243485.5 MIR1302-2HG
#> 5 ENSG00000237613.2 FAM138A
#> 6 ENSG00000268020.3 OR4G4P
## Marker genes: brain (OLIG1/2), pancreas (PRSS1/PTF1A), heart (MYH6/MYL4).
specific_genes <- c("OLIG1", "OLIG2", "PRSS1", "PTF1A", "MYH6", "MYL4")
## Map symbols to versioned Ensembl ids, keep only genes present in the score
## matrices, restore the specific_genes ordering, then cross with the two
## weighting schemes and drop genes that failed any step (NA rows).
spec_genes_w_ids <- gtex_rowinfo[gtex_rowinfo$Description %in% specific_genes, ]
spec_genes_w_ids <- spec_genes_w_ids[spec_genes_w_ids$Name %in% rownames(spec_mats$delta_Zscore), ]
spec_genes_w_ids <- spec_genes_w_ids[match(specific_genes, spec_genes_w_ids$Description), ]
spec_genes_w_ids <- merge(data.frame(f_or_w = c("flat", "weighted")), spec_genes_w_ids)
spec_genes_w_ids <- spec_genes_w_ids[complete.cases(spec_genes_w_ids), ]
col_funs <- list()
## Color functions for the marker-gene heatmaps: a diverging blue-red ramp
## for Z-scores (grey20 catches overflow above 3.99), a sequential
## white-to-red ramp for the bounded 0-1 measures.
for(measure in unique(str_split_fixed(names(spec_mats), "_", 2)[, 2])){
  if(measure == "Zscore"){
    ## BUGFIX(review): in the original, the second inner max() closed before
    ## its na.rm argument, so `na.rm = TRUE` was passed to c() instead
    ## (appending a spurious 1 to the breakpoint candidates) and that max()
    ## could return NA if the flat matrix contains NAs. na.rm now sits inside
    ## both inner calls, mirroring the min() side.
    lo <- min(c(min(spec_mats$weighted_Zscore[spec_genes_w_ids$Name, ], na.rm = TRUE),
                min(spec_mats$flat_Zscore[spec_genes_w_ids$Name, ], na.rm = TRUE), -4))
    hi <- max(c(max(spec_mats$weighted_Zscore[spec_genes_w_ids$Name, ], na.rm = TRUE),
                max(spec_mats$flat_Zscore[spec_genes_w_ids$Name, ], na.rm = TRUE), 4))
    # earlier breakpoint experiment: -2, -1.99, 0, 1.99, 2, 3.99
    col_funs[[measure]] <- colorRamp2(c(lo, -3, -2, 0, 2, 3, 3.99, hi),
                                      c("darkblue", "blue", "#aaaaff", "white", "#ffaaaa", "red", "red4", "grey20"))
  } else {
    col_funs[[measure]] <- colorRamp2(c(0, 0.4, 0.6, 0.8, 1),
                                      c("white", "lightgoldenrod1", "yellow", "red", "red4"))
    #col_funs[[measure]] <-colorRamp2(c( 0, 0.5, 1),
    #                                 c("white","red","red4"))
  }
}
## Build a heatmap of the selected genes: rows are (gene, flat/weighted)
## combinations, columns are GTEx tissues.
heatmaps <- HeatmapList()
for(measure in unique(str_split_fixed(names(spec_mats), "_", 2)[,2] )){
## one row per (gene, flat/weighted) combination, pulled from the matching
## flat_<measure> / weighted_<measure> matrix
temp_mat <- matrix(nrow=nrow(spec_genes_w_ids),ncol=ncol(spec_mats[[paste("delta",measure, sep = "_")]]))
colnames(temp_mat) <- colnames(spec_mats[[paste("delta",measure, sep = "_")]])
rownames(temp_mat) <- spec_genes_w_ids$Name
for(i in 1:nrow(spec_genes_w_ids)){
temp_mat[i,] <- spec_mats[[paste(spec_genes_w_ids$f_or_w[i], measure, sep = "_")]][spec_genes_w_ids$Name[i],]
}
if(measure=="Zscore"){
## group each gene's two rows (flat/weighted) into a labelled block;
## "W F" in the label marks which row is weighted vs flat
split <- rep(1:(nrow(spec_genes_w_ids)/2), each=2)
ra <- rowAnnotation(foo = anno_block(labels = paste(unique(spec_genes_w_ids$Description), "\nW F")))
h1 <- Heatmap(temp_mat, name=measure,
cluster_rows = FALSE, show_row_names = FALSE,
cluster_columns = sim_tree,
column_order = colnames(spec_mats$delta_Zscore),
row_split = split,
left_annotation = ra,
col = col_funs[[measure]]
)
} else {
## NOTE(review): non-Zscore heatmaps are currently disabled; if re-enabled,
## beware that h1 from the previous iteration is otherwise reused below.
# h1 <- Heatmap(temp_mat, name=measure,
# cluster_rows = FALSE, show_row_names = TRUE,
# col = col_funs[[measure]])
}
## NOTE(review): the else branch is deliberately commented out, so only the
## first (Zscore) heatmap is kept in the list.
if(length(heatmaps)==0){
heatmaps <- h1
# } else {
# heatmaps <- heatmaps + h1
}
}
draw(heatmaps)
## get particular values reported in paper
## Extract the flat/weighted Zscore rows for the selected genes into a plain
## matrix so specific values can be quoted.
measure="Zscore"
temp_mat <- matrix(nrow=nrow(spec_genes_w_ids),ncol=ncol(spec_mats[[paste("delta",measure, sep = "_")]]))
colnames(temp_mat) <- colnames(spec_mats[[paste("delta",measure, sep = "_")]])
rownames(temp_mat) <- spec_genes_w_ids$Name
for(i in 1:nrow(spec_genes_w_ids)){
## pull this gene's row from the matching flat_/weighted_ matrix
temp_mat[i,] <- spec_mats[[paste(spec_genes_w_ids$f_or_w[i], measure, sep = "_")]][spec_genes_w_ids$Name[i],]
}
## label rows "<flat|weighted> <symbol>"; c("flat","weighted") recycles over
## rows — assumes rows alternate flat/weighted per gene (TODO confirm order)
rownames(temp_mat) <- paste(c("flat","weighted"), spec_genes_w_ids$Description)
## gene ontology summaries
library(DOSE)
library(pathview)
library(clusterProfiler)
library(org.Hs.eg.db)
library(tidyverse)
library(topGO)
library(enrichplot)
## Specificity Z-score matrices under tree-derived weights vs a flat
## (all-ones) weighting.
spec_mat_weighted <- calc_weighted_zscore_matrix(exp_mat, weights = weights)
flat <- weights; flat[seq_along(flat)] <- 1
spec_mat_flat <- calc_weighted_zscore_matrix(exp_mat, flat)
cutoff <- 2
## Universe: all measured genes, Ensembl IDs with the version suffix stripped.
allOE_genes <- str_split_fixed(rownames(spec_mat_flat),"[.]",2)[,1]
## Genes called specific by the weighted measure but not the flat one in at
## least one sample (weighted >= cutoff while flat <= cutoff).
## Cleanups vs original: removed a dead, immediately-overwritten which()
## assignment; use the `cutoff` variable instead of a hard-coded 2.0; avoid
## storing results in a variable named `which` (shadows base::which); replace
## the as.matrix()/rownames() trick with names(which(...)).
sigOE_genes <- character()
for(j in seq_len(ncol(spec_mat_weighted))){
  hits <- names(which(spec_mat_flat[,j] <= cutoff & spec_mat_weighted[,j] >= cutoff))
  sigOE_genes <- c(sigOE_genes, str_split_fixed(hits, "[.]", 2)[,1])
}
sigOE_genes <- unique(sigOE_genes)
## GO over-representation test: genes gained by the weighted measure
## (sigOE_genes) against the universe of all measured genes (allOE_genes).
for(onto in c("BP")){ #},"MF","CC")){
print(onto)
ego <- enrichGO(gene = sigOE_genes,
universe = allOE_genes,
keyType = "ENSEMBL",
OrgDb = org.Hs.eg.db,
ont = onto,
pAdjustMethod = "BH",
qvalueCutoff = 0.05,
readable = TRUE,
pool=TRUE)
n=10
print(dotplot(ego, showCategory = n))
## emapplot requires the pairwise term-similarity matrix to be precomputed
ego <- enrichplot::pairwise_termsim(ego)
print(emapplot(ego, showCategory = n))
}
#> [1] "BP"
#> wrong orderBy parameter; set to default `orderBy = "x"`
## Tabulate enrichment results; frac_terms = numerator of GeneRatio over
## numerator of BgRatio, i.e. fraction of each term's universe genes that
## are in the significant set.
cluster_summary <- data.frame(ego)
cluster_summary$frac_terms <- as.numeric(str_split_fixed(cluster_summary$GeneRatio,"/",2)[,1] ) / as.numeric( str_split_fixed(cluster_summary$BgRatio,"/",2)[,1] )
## only 1 similarity function tested for now, can make as list later
## Sample similarity = cosine similarity of Z-scored expression profiles.
similarity_func <- function(exp_mat){calc_dot_product_similarity_matrix(calc_zscore_matrix(exp_mat))}
## only 1 clustering function tested for now, can make as a list later
## Hierarchical clustering on 1 - similarity; the dendrogram is annotated
## with distance-to-parent and the derived per-sample weights.
cluster_func <- function(sim_mat, method="single"){
add_weights(add_dist_to_parent(as.dendrogram(hclust(as.dist(1-sim_mat), method = method) ) ))
}
## Compare linkage methods visually.
sim_mat <- similarity_func(exp_mat)
cluster_methods <- c("single","complete","average")
for(cm in cluster_methods){
par(mar = c(16,2,2,2)) ## wide bottom margin for long tissue labels
sim_tree <- cluster_func(sim_mat = sim_mat, method = cm)
sim_tree %>% plot(main=cm)
}
### to do: justify method for measuring sample similarity - may require comparison or refs
## Cosine (normalised dot-product) similarity between all pairs of columns.
##
## dat: numeric matrix (genes x samples); may contain NAs.
## Returns an ncol(dat) x ncol(dat) symmetric matrix of cosine similarities,
## with row/column names copied from colnames(dat).
##
## BUGFIX: the original dropped NAs from each column independently, so the two
## vectors in sum(x * y) could have different lengths and were silently
## recycled, misaligning genes between samples. NAs are now dropped pairwise
## so products always pair the same genes. Also, base::norm() requires a
## matrix argument; sqrt(sum(x^2)) is the vector L2 norm.
calc_dot_product_similarity_matrix <- function(dat) {
  n <- ncol(dat)
  sim <- matrix(0, nrow = n, ncol = n)
  colnames(sim) <- colnames(dat)
  rownames(sim) <- colnames(dat)
  for (i in seq_len(n)) {
    for (j in seq_len(n)) {
      ok <- !is.na(dat[, i]) & !is.na(dat[, j]) ## pairwise-complete genes only
      x <- dat[ok, i]
      y <- dat[ok, j]
      sim[i, j] <- sum(x * y) / (sqrt(sum(x^2)) * sqrt(sum(y^2)))
    }
  }
  return(sim)
}
## Rescale cosine similarities to [0,1] and draw the clustered heatmap.
sim_mat<- (similarity_func(exp_mat))
sim_mat <- (sim_mat-min(sim_mat,na.rm=T)); sim_mat <- sim_mat/max(sim_mat,na.rm = T) ## coerce domain to [0-1]
sim_tree <- cluster_func(sim_mat)
heatmap(sim_mat, Rowv = rev(sim_tree), Colv = sim_tree, scale = "none", margins=c(11,13))
####ADDED
## Pairwise Euclidean distance between the columns of `dat`.
## NA entries are skipped pairwise via na.rm, so missing genes simply
## contribute nothing to the squared-difference sum.
calc_eucl_matrix <- function(dat) {
  n_samp <- ncol(dat)
  samp_names <- colnames(dat)
  out <- matrix(0, nrow = n_samp, ncol = n_samp,
                dimnames = list(samp_names, samp_names))
  for (a in seq_len(n_samp)) {
    for (b in seq_len(n_samp)) {
      diffs <- dat[, a] - dat[, b]
      out[a, b] <- sqrt(sum(diffs^2, na.rm = TRUE))
    }
  }
  out
}
## Convert Euclidean distance to a similarity (1 - d), rescale to [0,1].
eucl_matrix<- 1-(calc_eucl_matrix(calc_zscore_matrix(exp_mat)))
eucl_matrix <- (eucl_matrix-min(eucl_matrix,na.rm=T)); eucl_matrix <- eucl_matrix/max(eucl_matrix,na.rm = T) ## coerce domain to [0-1]
sim_tree <- cluster_func(eucl_matrix)
heatmap(eucl_matrix, Rowv = rev(sim_tree), Colv = sim_tree, scale = "none", margins=c(11,13))
####ADDED
## Pairwise Manhattan (L1) distance between the columns of `dat`, scaled so
## the largest distance is 1. NA entries are skipped pairwise via na.rm.
##
## BUGFIX: the original scaled one column at a time while recomputing
## max(manhatt) over the partially-scaled matrix, so columns processed after
## the global maximum had already been scaled were divided by a smaller
## (wrong) denominator. The maximum is now taken once, before scaling.
calc_manhattan_matrix<- function(dat) {
  manhatt <- matrix(0, nrow = ncol(dat), ncol = ncol(dat))
  colnames(manhatt) <- colnames(dat)
  rownames(manhatt) <- colnames(dat)
  for(i in seq_len(ncol(dat))){
    for(j in seq_len(ncol(dat))) {
      manhatt[i,j] <- sum(abs(dat[,i]-dat[,j]), na.rm = TRUE)
    }
  }
  manhatt <- manhatt / max(manhatt, na.rm = TRUE) ## single global scaling
  return(manhatt)
}
## Convert Manhattan distance to a similarity (1 - d), rescale to [0,1].
manhat_matrix<- 1-(calc_manhattan_matrix(calc_zscore_matrix(exp_mat)))
manhat_matrix <- (manhat_matrix-min(manhat_matrix,na.rm=T)); manhat_matrix <- manhat_matrix/max(manhat_matrix,na.rm = T) ## coerce domain to [0-1]
sim_tree <- cluster_func(manhat_matrix)
heatmap(manhat_matrix, Rowv = sim_tree, Colv = rev(sim_tree), scale = "none", margins=c(11,13))
## Pairwise Canberra distance between the columns of `dat`, scaled so the
## largest distance is 1. Terms where both entries are 0 give 0/0 = NaN and
## are dropped by na.rm, matching the original behaviour.
##
## BUGFIX: same scaling bug as calc_manhattan_matrix — the per-column loop
## recomputed max() over the partially-scaled matrix; scale once instead.
calc_canberra_matrix<- function(dat) {
  canberra <- matrix(0, nrow = ncol(dat), ncol = ncol(dat))
  colnames(canberra) <- colnames(dat)
  rownames(canberra) <- colnames(dat)
  for(i in seq_len(ncol(dat))){
    for(j in seq_len(ncol(dat))) {
      canberra[i,j] <- sum(abs(dat[,i] - dat[,j]) / (abs(dat[,i])+abs(dat[,j])), na.rm = TRUE)
    }
  }
  canberra <- canberra / max(canberra, na.rm = TRUE) ## single global scaling
  return(canberra)
}
## Convert Canberra distance to a similarity (1 - d), rescale to [0,1].
canberra_mat<- 1-(calc_canberra_matrix( calc_zscore_matrix(exp_mat)))
canberra_mat <- (canberra_mat-min(canberra_mat,na.rm=T)); canberra_mat <- canberra_mat/max(canberra_mat,na.rm = T) ## coerce domain to [0-1]
sim_tree <- cluster_func(canberra_mat)
heatmap(canberra_mat, Rowv = sim_tree, Colv = rev(sim_tree), scale = "none", margins=c(11,13))
## import gtex medians data
## Download GTEx v8 per-tissue median TPM (GCT format: 2 header lines, then
## Name / Description / one column per tissue).
temp <- tempfile()
download.file("https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz",temp)
gtex <- read.table( temp, skip=2, header = TRUE, sep = "\t")
## keep gene id/symbol mapping separately, then drop those columns to get a
## purely numeric tissue matrix
gtex_rowinfo <- data.frame(Name=gtex$Name, Description=gtex$Description)
rownames(gtex) <- gtex$Name
gtex <- as.matrix(gtex[,3:ncol(gtex)])
unlink(temp); rm(temp)
## import ensembl gene data
## Protein-coding genes on canonical chromosomes with an HGNC symbol
## (GRCh37 annotation).
ensembl = useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl", GRCh=37)
genes <- getBM(attributes=c('chromosome_name','start_position','end_position','hgnc_symbol', 'ensembl_gene_id','gene_biotype'),
filters = list('biotype'='protein_coding'),
mart = ensembl, useCache = F)
genes <- genes[which(is.element(genes$chromosome_name, c(1:22, "X", "Y", "MT")) & genes$hgnc_symbol != "" ) ,]
## show mitochondrial genes drive a large part of sample similarity
## Note sum of medians in gtex not quite 1M - likely artifact of taking medians
## Per-tissue total TPM (should be ~1e6 per TPM definition).
temp <- data.frame(names=colnames(gtex), total_transcripts=colSums(gtex))
ggplot( temp ,aes(y=total_transcripts, x=names))+
geom_col()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## match genes between gtex and ensembl
## Split the expression matrix into nuclear (non-MT) and mitochondrial gene
## sets. Cleanup vs original: which() results were stored in a variable named
## `which`, shadowing base::which — use descriptive index names instead.
gtex_names <- str_split_fixed(gtex_rowinfo$Name, "[.]", 2)[,1] ## strip version suffix
non_mt_rows <- which(genes$chromosome_name != "MT" )
gtex_cleaned <- gtex[which(is.element(gtex_names, genes$ensembl_gene_id[non_mt_rows])),]
mt_rows <- which(genes$chromosome_name == "MT" )
gtex_cleanedMT <- gtex[which(is.element(gtex_names, genes$ensembl_gene_id[mt_rows])),]
##non-mitochondrial TPM sum
temp <- data.frame(names=colnames(gtex_cleaned), total_transcripts=colSums(gtex_cleaned))
ggplot( temp ,aes(y=total_transcripts, x=names))+
geom_col()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
##mitochondrial TPM sum
temp <- data.frame(names=colnames(gtex_cleanedMT), total_transcripts=colSums(gtex_cleanedMT))
ggplot( temp ,aes(y=total_transcripts, x=names))+
geom_col()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
rm(gtex_cleanedMT)
##renormalize TPM without mitochondrial genes
## rescale each tissue column back to a total of 1e6
for(i in 1:ncol(gtex_cleaned)){
gtex_cleaned[,i] <- (gtex_cleaned[,i]*1e6 / sum(gtex_cleaned[,i]))
}
## sanity check: totals should now all be 1e6
temp <- data.frame(names=colnames(gtex_cleaned), total_transcripts=colSums(gtex_cleaned))
ggplot( temp ,aes(y=total_transcripts, x=names))+
geom_col()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
gtex <- gtex_cleaned; rm(gtex_cleaned)
exp_mat <- gtex
## log10(TPM+1) transform of data
exp_mat <- log10(exp_mat+1)
## remove mitochondrial contribution
## NOTE(review): grobl_supp is assembled elsewhere (not visible in this
## chunk) — confirm it exists before this point.
grid.arrange(grobs=grobl_supp, nrow=4, heights=c(5,3,3,3))
## get gene ids (rows from df_1b) that correspond to given GO id/ GO label
## generate small set of brain related GO id/ GO labels
## plot on one v all plot
## Re-query Ensembl for GO annotations per gene.
genes <- getBM(attributes=c('hgnc_symbol', 'ensembl_gene_id','go_id','name_1006'),
mart = ensembl, useCache = F)
#genes <- load("genes_w_go.RData")
## Universe for GO tests: all measured genes, version suffix stripped.
allOE_genes<-str_split_fixed(rownames(exp_mat),"[.]",2)[,1]
## For each specificity measure: take the top 1% of genes by delta
## (weighted - flat), run GO enrichment on them, and plot the one-vs-all
## scatter with the selected genes highlighted, alongside the GO summaries.
ggl <- list()
for(measure in specificity_measures$func_names){
temp_df <- df_1b[which( df_1b$measure == measure),]
## top 1% of genes by delta
temp_df_sub <- slice_max(temp_df, order_by=delta, n=nrow(temp_df)/100 )
sigOE_genes <- str_split_fixed(temp_df_sub$Var1,"[.]",2)[,1]
ego <- enrichGO(gene = sigOE_genes,
universe = allOE_genes,
keyType = "ENSEMBL",
OrgDb = org.Hs.eg.db,
ont = "BP",
pAdjustMethod = "BH",
qvalueCutoff = 0.05,
readable = TRUE,
pool=TRUE)
## one-vs-all scatter; red points mark the top-1% delta genes
ggl[[measure]] <- ggplot(temp_df, aes(x=all, y=one))+
geom_point(size=0.1)+
geom_point(data=temp_df_sub, aes( x=all, y=one), color="red")+
ggtitle(measure)+
geom_abline(slope=1,intercept=0, color="red", size=0.3)+
theme_bw()+
theme(panel.grid.minor.x = element_blank())
n=10
## NOTE(review): filename is built but no png() device is opened here —
## the dev.off() below is also commented out.
filename <- paste(figures_dir,"/",measure,"_top1percent_deltas_go.png",sep = "")
ego <- enrichplot::pairwise_termsim(ego)
p1 <- ggl[[measure]]
p2 <- enrichplot::dotplot(ego, showCategory = n, font.size=8, label_format=15)+
theme(legend.position = "none")
p3 <- emapplot(ego, showCategory = n, cex_label_category=0.5, cex_category=0.5, cex_line=0.5 )+
scale_y_discrete(labels=str_wrap(ego@result$Description[1:n], width=15 ))
grid.arrange(p1, p2,p3, ncol=3) #, widths = c(3,3,4))
#dev.off()
}
#> wrong orderBy parameter; set to default `orderBy = "x"`
#> Warning: Removed 25284 rows containing missing values (geom_point).
#> wrong orderBy parameter; set to default `orderBy = "x"`
#> Warning: Removed 602 rows containing missing values (geom_point).
#> wrong orderBy parameter; set to default `orderBy = "x"`
#> Warning: Removed 602 rows containing missing values (geom_point).
#> wrong orderBy parameter; set to default `orderBy = "x"`
#> Warning: Removed 602 rows containing missing values (geom_point).
## dot similarity from initial Z scores
dot_sim <- similarity_func(exp_mat)
sim_tree <- cluster_func(dot_sim)
heatmap(dot_sim, Rowv = rev(sim_tree), Colv = sim_tree, scale = "none", margins=c(11,13), symm=T)
sim_tree <- cluster_func(dot_sim)
## plot similarity tree with weights
## node size scaled by sqrt of the node's weight attribute
par(mar=c(2,2,2,15))
sim_tree %>%
dendextend::set("nodes_pch",19) %>%
dendextend::set("nodes_cex",2* sqrt(get_nodes_attr(sim_tree,"weight"))) %>%
plot(horiz=T)
## take a look at the weights
temp <- get_weights(sim_tree, get_leaves_attr(sim_tree, "label"))
temp <- data.frame(weights=temp, names=factor(names(temp), levels=rev(names(temp)) ) )
g <- ggplot( temp ,aes(y=fct_rev(as.factor(names)), x=weights))+
geom_col() #+
#theme(axis.text.y = element_text(angle = 90, vjust = 0.5, hjust=1))
g
## per-sample weights in expression-matrix column order, used below
weights <- get_weights(sim_tree, colnames(exp_mat))
## Compute each specificity measure under weighted and flat (all-ones)
## weightings, then compare their distributions.
spec_mat_weighted <- calc_weighted_zscore_matrix(exp_mat, weights = weights)
flat <- weights; flat[1:length(flat)] <- 1
spec_mat_flat <- calc_weighted_zscore_matrix(exp_mat, flat)
## plot to look at global dif between flat and weighted
tau_flat <- calc_weighted_tau(exp_mat, flat)
tau_weighted <- calc_weighted_tau(exp_mat, weights)
tsi_flat <- calc_weighted_tsi(exp_mat,flat)
tsi_weighted <- calc_weighted_tsi(exp_mat,weights)
gini_flat <- calc_weighted_gini(exp_mat, flat)
gini_weighted <- calc_weighted_gini(exp_mat, weights)
## marginal distributions of each measure
hist(spec_mat_flat ,breaks=100)
hist(spec_mat_weighted ,breaks=100)
hist(tau_flat ,breaks=100)
hist(tau_weighted ,breaks=100)
hist(tsi_flat ,breaks=100)
hist(tsi_weighted ,breaks=100)
hist(gini_flat ,breaks=100)
hist(gini_weighted ,breaks=100)
## distributions of the weighted-minus-flat differences
hist(spec_mat_weighted - spec_mat_flat ,breaks=100)
hist(tau_weighted - tau_flat ,breaks=100)
hist(tsi_weighted - tsi_flat ,breaks=100)
hist(gini_weighted - gini_flat ,breaks=100)
## load data instead
## Robustness test: starting from partition P1, add 1..n samples from the
## complementary partition P2 and measure how specificity scores drift.
if(TRUE){ ## NOTE! This chunk can require a lot of memory and time depending if running sequential. If you want to look at robustness test results it is recommended you have enabled parallel computing (e.g. check getDoParWorkers > 3 or 4 (preferably more))
## Use this variable to control whether performing random or true brain-non-brain partition
random_partition = FALSE
## one empty slot per (partition, measure) combination
robustness_test_results <- list( )
for(i in 1:length(specificity_measures$func_names)){
el_names <- paste( c( "P1_", "P2_")
, specificity_measures$func_names[i], sep="")
temp_list <- setNames(list(list(), list()), nm=el_names )
robustness_test_results <- c(robustness_test_results, temp_list )
}
num_brain_samples <- length(colnames(exp_mat)[grep("[Bb]rain",colnames(exp_mat))])
num_reps <- 3
## use general similarity_func and cluster_func in case we want to test more later
## NOTE(review): this redefines cluster_func with the linkage fixed to
## "single", shadowing the parameterized version defined earlier.
similarity_func <- function(exp_mat){calc_dot_product_similarity_matrix(calc_zscore_matrix(exp_mat))}
cluster_func <- function(sim_mat){add_weights(add_dist_to_parent(as.dendrogram(hclust(as.dist(1-sim_mat), method = "single") ) ))}
## to switch analysis between True Brain Partition and equalivalent sized Random Partition set random_partition ##
if(random_partition){
which <- sample(1:ncol(exp_mat), length(grep("[Bb]rain",colnames(exp_mat),invert = TRUE)))
P1 <- as.matrix(exp_mat[, which])
notP1 <- as.matrix(exp_mat[, which(!is.element(1:ncol(exp_mat),which ))])
}else{
## P1 and notP1 are contant throughout, so set these outside loops
P1 <- as.matrix(exp_mat[,grep("[Bb]rain",colnames(exp_mat), invert = TRUE)])
notP1 <- as.matrix(exp_mat[,grep("[Bb]rain",colnames(exp_mat), invert = FALSE)])
}
### define 1-n trajectories ahead of time since i+1 should be referenced against i.
## do foreach over each element of permuation matrix ## use arrayInd to get row/col given perm number
## permutation_matrix[i, rep] = i-th sample added in replicate `rep`
permutation_matrix <- matrix(nrow = num_brain_samples, ncol = num_reps )
for(rep in 1:num_reps){
permutation_matrix[,rep] <- sample(colnames(notP1),num_brain_samples, replace = FALSE)
}
## NOTE(review): load() assumes the saved object is named permutation_matrix
## and that the file is in the working directory — confirm.
if(!random_partition){
load("final_permutation_matrix") ## reproduce same used in paper figures
}
if(random_partition){
load("final_random_permutation_matrix") ## reproduce same used in paper figures
}
for(measure in specificity_measures$func_names){
specificity_func <- specificity_measures$funcs[[measure]]
if(!is.function(specificity_func)){print(paste(measure, "func is not a function")); next;}
## one parallel task per (n, rep) cell; result names encode "rep=R,n=N"
results <- foreach(perm=1:length(permutation_matrix), .errorhandling = "pass", .final = function(x) setNames(x,paste("rep=", arrayInd(1:length(permutation_matrix), dim(permutation_matrix))[,2], ",n=", arrayInd(1:length(permutation_matrix), dim(permutation_matrix ))[,1], sep = ""))) %dopar% {
library(Hmisc)
library(dendextend)
library(reldist)
sample_indices <- 1:(arrayInd(perm, dim(permutation_matrix))[,1]) ## row is sample name, all names in a row up to top are choices for n=1,2..i
rep_index <- arrayInd(perm, dim(permutation_matrix))[,2] ## column is replicate number
sample_set <- permutation_matrix[sample_indices, rep_index]
# P1_baseline is just P1 at n=1
P2 <- notP1[,sample_set, drop=F]; colnames(P2) <- make.unique(colnames(P2)) ## make.unique will facilitate sampling with replacement
P1uP2 <- cbind(P1,P2)
## P1uP2 ## exp_mat > get sim mat > get sim tree > get samp weights > get spec mat
sim_mat <- similarity_func(P1uP2)
sim_tree <- cluster_func(sim_mat)
weights <- get_weights(sim_tree, colnames(P1uP2))
flat <- weights; flat[] <- 1
specificity_flat <- specificity_func(P1uP2, flat)
specificity_weighted <- specificity_func(P1uP2, weights)
# need to fill in these with spec_score matrices for each n for each method
# names should match
### restore once error fixed
if(specificity_measures$out_type[[measure]]=="matrix"){
result <- list(
P1_weighted_spec_score = specificity_weighted[,colnames(P1), drop=F],
P1_flat_spec_score = specificity_flat[,colnames(P1), drop=F],
P2_weighted_spec_score = specificity_weighted[,colnames(P2), drop=F],
P2_flat_spec_score = specificity_flat[,colnames(P2), drop=F]
)
}else if(specificity_measures$out_type[[measure]]=="vector"){
## vector-valued measures (one value per gene) cannot be split by sample
result <- list(P1_weighted_spec_score = specificity_weighted,
P1_flat_spec_score = specificity_flat,
P2_weighted_spec_score = specificity_weighted,
P2_flat_spec_score = specificity_flat
)
}
}
## split results into P1/P2 halves, freeing memory as we go
results_P1 <- list()
results_P2 <- list()
for(i in 1:length(results)){
name <- names(results)[i]
## NOTE(review): rep and n below are parsed from names(results)[1] (not [i])
## and are never used afterwards — dead code.
rep <- str_split_fixed(names(results)[1], "[=,]", n=4)[,2]
n <- str_split_fixed(names(results)[1], "[=,]", n=4)[,4]
results_P1[[name]] <- list(P1_weighted_spec_score = results[[name]]$P1_weighted_spec_score, P1_flat_spec_score = results[[name]]$P1_flat_spec_score)
results[[name]]$P1_weighted_spec_score <- NULL; results[[name]]$P1_flat_spec_score <- NULL; ## large object so delete as we go to avoid copy
results_P2[[name]] <- list(P2_weighted_spec_score = results[[name]]$P2_weighted_spec_score, P2_flat_spec_score = results[[name]]$P2_flat_spec_score)
results[[name]]$P2_weighted_spec_score <- NULL; results[[name]]$P2_flat_spec_score<- NULL; ## large object so delete as we go to avoid copy
}
robustness_test_results[[paste("P1_", measure,sep = "")]] <- results_P1; rm(results_P1)
robustness_test_results[[paste("P2_", measure,sep = "")]] <- results_P2; rm(results_P2)
}
} ## end of chunk control
## output to look at is robustness_test_results
#save.image()
## Accumulator for robustness-test variance summaries: one row per
## (measure, partition, weighting, n, replicate) combination.
df_robustness_test_results <- data.frame(
  measure        = character(),
  P1orP2         = character(),
  weight_or_flat = character(),
  n              = numeric(),
  rep            = numeric(),
  variance       = numeric()
)
## Flatten robustness_test_results into a data frame: for each (measure,
## partition, weighting, n, rep), the variance of score differences relative
## to that replicate's n=1 baseline.
for(el in 1:length(robustness_test_results)){
if(length(robustness_test_results[[el]])==0){next;}
## element names look like "P1_Zscore" -> c(partition, measure)
temp_name <- str_split_fixed(names(robustness_test_results[el]),"_",2)
temp_len <- length(robustness_test_results[[el]]) ## each has weighted and flat
temp_df <- data.frame(measure=rep(temp_name[2],temp_len),
P1orP2=rep(temp_name[1],temp_len),
weight_or_flat=character(temp_len),
n=numeric(temp_len),
rep=numeric(temp_len),
variance=numeric(temp_len))
temp_df_list <- list("weighted"=temp_df, "flat"=temp_df)
temp_df_list[["weighted"]]$weight_or_flat <- "weighted"
temp_df_list[["flat"]]$weight_or_flat <- "flat"
## build the temp_dfs in these loops
for(i in 1:length(robustness_test_results[[el]])){
for(f_or_w in c("flat","weighted")){
## inner names look like "rep=R,n=N"; split on "=" and ","
temp_name <- str_split_fixed(names(robustness_test_results[[el]][i]),"[,=]",4)
temp_df_list[[f_or_w]]$rep[i] <- as.numeric(temp_name[,grep("rep", temp_name)+1])
temp_df_list[[f_or_w]]$n[i] <- as.numeric(temp_name[,grep("n", temp_name)+1])
temp <- robustness_test_results[[el]][[i]]
temp <- temp[[grep(f_or_w, names(temp))]]
## baseline matched on replicate, and is set at n=1
temp_baseline <- robustness_test_results[[el]][[paste("rep=",temp_name[,grep("rep", temp_name)+1],",n=1",sep="")]]
temp_baseline <- temp_baseline[[grep(f_or_w, names(temp_baseline))]]
## variance of elementwise drift from the n=1 baseline
temp_df_list[[f_or_w]]$variance[i] <- var(as.vector(as.matrix(temp)) - as.vector(as.matrix(temp_baseline)), na.rm=T)
}
}
## compile temp_dfs into final result
df_robustness_test_results <- rbind(df_robustness_test_results, temp_df_list[["weighted"]], temp_df_list[["flat"]])
}
df_2c <- df_robustness_test_results
## Mean variance across replicates per (measure, partition, weighting, n).
df_2c_summary_by_partition <- aggregate(df_2c$variance,
by=list(measure=df_2c$measure,
P1orP2=df_2c$P1orP2,
weight_or_flat= df_2c$weight_or_flat,
n=df_2c$n),
FUN=function(x) mean(x,na.rm=TRUE))
names(df_2c_summary_by_partition)[ncol(df_2c_summary_by_partition)] <- "mean_var"
## Matching SD of variance across replicates (same grouping, same row order).
df_2c_summary_by_partition$sd_of_var <- aggregate(df_2c$variance,
by=list(measure=df_2c$measure,
P1orP2=df_2c$P1orP2,
weight_or_flat= df_2c$weight_or_flat,
n=df_2c$n),
FUN=function(x) sd(x,na.rm=TRUE))$x
## Ribbon bounds: mean +/- standard error (sd / sqrt(num_reps)).
temp <- df_2c_summary_by_partition
df_2c_summary_by_partition$y_min <- temp$mean_var-temp$sd_of_var/sqrt(num_reps)
df_2c_summary_by_partition$y_max <- temp$mean_var+temp$sd_of_var/sqrt(num_reps)
## Variance-vs-n plots: one panel set for Zscore, one combined panel for all
## other measures. Ribbon = mean +/- SE across replicates; points = per-rep
## variances.
ggl <- list()
## Zscore first
for(measure in c("Zscore", "notZscore")){
  if(measure == "Zscore"){
    rows_summary <- which(df_2c_summary_by_partition$measure == measure)
    rows_points <- which(df_robustness_test_results$measure == measure)
  } else {
    rows_summary <- which(df_2c_summary_by_partition$measure != "Zscore")
    rows_points <- which(df_robustness_test_results$measure != "Zscore")
  }
  gg <- ggplot(df_2c_summary_by_partition[rows_summary,], aes(x=n, y=mean_var)) +
    scale_color_manual(values=c("grey20","steelblue3"))+
    scale_fill_manual(values=c("grey20","steelblue3"))+
    geom_ribbon(aes( group=weight_or_flat, ymin=y_min, ymax=y_max, fill=weight_or_flat),alpha=0.5)+
    ## BUGFIX: ymin/ymax/fill are not geom_line aesthetics — they triggered
    ## "Ignoring unknown aesthetics: ymin, ymax, fill" warnings. Pass only
    ## the aesthetics the line actually uses.
    geom_line(aes( group=weight_or_flat, color=weight_or_flat))+
    geom_point(data = df_robustness_test_results[rows_points,], size=0.2,
               aes(x=n, y=variance,group=weight_or_flat,color=weight_or_flat, fill=weight_or_flat))+
    theme_bw()+
    theme(panel.grid.minor.x = element_blank())+
    scale_x_continuous(limits = c(0,13), breaks=seq(0,12,by=1), expand = c(0,0))
  if(measure == "Zscore"){
    gg <- gg+facet_wrap(measure ~ P1orP2 , scales = "free",
                        labeller=labeller(P1orP2=c("P1"="P1 (non-brain)", "P2"="P2 (brain)")))
    gg <- gg+theme(legend.position = "none")
  } else {
    gg <- gg+facet_wrap( ~ measure, ncol = 3)
    gg <- gg+theme(legend.position="bottom")
  }
  ggl[[measure]] <- gg
}
## Convert each ggplot to a grob so grid.arrange can lay them out together;
## lapply preserves the list names.
grobl <- lapply(ggl, ggplotGrob)
grid.arrange(grobs=grobl, nrow=2)
### stats reported in paper
## paired t-test for difference in means
### t = delta / sd / sqrt(n)
## Row indices of df_2c at the largest n, for each combination of partition
## (P1/P2) and weighting (flat/weighted) of the Zscore measure.
which_list <- list()
n_max <- max(df_2c$n)
for (partition in c("P1", "P2")) {
  for (w_or_f in c("flat", "weighted")) {
    key <- paste("Z", partition, w_or_f, sep = "_")
    which_list[[key]] <- which(df_2c$measure == "Zscore"
                               & df_2c$P1orP2 == partition
                               & df_2c$n == n_max
                               & df_2c$weight_or_flat == w_or_f)
  }
}
print("t-test P1 weighted / P1 flat : %diff between weighted and flat")
#> [1] "t-test P1 weighted / P1 flat : %diff between weighted and flat"
## t.test on logs gives ratio estimates
## Paired across replicates; 1 - 10^estimate = fractional reduction in
## variance of weighted relative to flat.
temp <- t.test(x=log10(df_2c[which_list$Z_P1_weighted,]$variance),y=log10(df_2c[which_list$Z_P1_flat,]$variance) ,paired=TRUE, conf.level = 0.95)
## antilog to recover ratio of weighted / flat
1-10^temp$estimate
#> mean of the differences
#> 0.7932209
1-10^temp$conf.int
#> [1] 0.8021299 0.7839107
#> attr(,"conf.level")
#> [1] 0.95
### 79.3% lower variance in weighted than flat (CI)
print("t-test P2 weighted / P2 flat : %diff between weighted and flat")
#> [1] "t-test P2 weighted / P2 flat : %diff between weighted and flat"
## Same test for the P2 (brain) partition.
temp <- t.test(x=log10(df_2c[which_list$Z_P2_weighted,]$variance),y=log10(df_2c[which_list$Z_P2_flat,]$variance) ,paired=TRUE, conf.level = 0.95)
1-10^temp$estimate
#> mean of the differences
#> 0.3578047
1-10^temp$conf.int
#> [1] 0.4196920 0.2893173
#> attr(,"conf.level")
#> [1] 0.95
###
## jaccard between n = 1 and n = num_brain_samples for brain v non-brain OR P1 v P2
## Accumulator for jaccard-overlap results: one row per
## (measure, n, rep, cut point, sample) combination.
df_2d <- data.frame(
  measure = character(), n = numeric(),
  rep = numeric(), cut_point = numeric(),
  jaccard = numeric(), P1orP2 = character(),
  weighted = logical(), samp = character(),
  gene_count = numeric(), gene_count_baseline = numeric(),
  gene_count_intersect = numeric()
)
## nested for loop builds df of jaccard results for plotting
Sys.time()
#> [1] "2022-01-02 13:47:37 PST"
## For each (partition, measure) element: at every n/rep and score cutoff,
## compute the jaccard overlap between the genes passing the cutoff now and
## at the n=1 baseline, per sample.
for(el in names(robustness_test_results)){
if(length(robustness_test_results[[el]])==0 ){next;}
P1orP2 <- str_split_fixed(el,"_",n=2)[1]
measure <- str_split_fixed(el,"_",n=2)[2]
## Zscore is unbounded above; the other measures live in [0,1]
if(measure == "Zscore"){cuts <- seq(0,4,0.5) ## set cutoffs
}else{ cuts <- seq(0,1,0.05)}
# for(i in 1:length(robustness_test_results[[el]]) ){ ### foreach at this level if implementing foreach
df_temp <- foreach(i=1:length(robustness_test_results[[el]]), .errorhandling = "pass", .combine = rbind) %dopar% {
#foreach(perm=1:length(permutation_matrix), .errorhandling = "pass", .final = function(x) setNames(x,paste("rep=", arrayInd(1:length(permutation_matrix), dim(permutation_matrix))[,2], ",n=", arrayInd(1:length(permutation_matrix), dim(permutation_matrix ))[,1], sep = ""))) %dopar% {
library(stringr)
df_internal <- data.frame(measure=character(), n=numeric(),
rep=numeric(), cut_point=numeric(),
jaccard=numeric(), P1orP2=character(),
weighted=logical(), samp=character(),
gene_count=numeric(), gene_count_baseline=numeric(),
gene_count_intersect=numeric())
## parse "rep=R,n=N" from the element name
n = as.numeric(str_split_fixed(names(robustness_test_results[[el]][i]), "[=,]", n=4 )[,4])
rep = as.numeric(str_split_fixed(names(robustness_test_results[[el]][i]), "[=,]", n=4 )[,2])
for(weighted_or_flat in c("weighted","flat")){
## baseline = same replicate at n=1 ("$" anchors the name match)
baseline_name <- names(robustness_test_results[[el]])[grep(paste("rep=",rep,",n=",1,"$", sep = ""),names(robustness_test_results[[el]]) )]
if(specificity_measures$out_type[measure] == "matrix"){
weighted_flat_index <- grep(weighted_or_flat,names(robustness_test_results[[el]][[baseline_name]]))
baseline_mat <- robustness_test_results[[el]][[baseline_name]][[weighted_flat_index]]
weighted_flat_index <- grep(weighted_or_flat,names(robustness_test_results[[el]][[i]]))
## restrict to the baseline's columns so samples line up
temp_mat <- robustness_test_results[[el]][[i]][[weighted_flat_index]][,colnames(baseline_mat),drop=FALSE]
}else if(specificity_measures$out_type[measure] == "vector"){
weighted_flat_index <- grep(weighted_or_flat,names(robustness_test_results[[el]][[baseline_name]]))
baseline_mat <- as.matrix(robustness_test_results[[el]][[baseline_name]][[weighted_flat_index]])
weighted_flat_index <- grep(weighted_or_flat,names(robustness_test_results[[el]][[i]]))
temp_mat <- as.matrix(robustness_test_results[[el]][[i]][[weighted_flat_index]])
}else{ stop("WARNING: no out type associated with measure") }
## preallocate one row per (sample, cutoff) pair
ncols <- ncol(temp_mat); ncuts <- length(cuts)
temp_df <- data.frame(measure=rep(measure, ncols*ncuts), n=rep(n, ncols*ncuts),
rep=rep(rep, ncols*ncuts), cut_point=numeric(length=ncols*ncuts),
jaccard=numeric(length=ncols*ncuts), P1orP2=rep(P1orP2, ncols*ncuts),
weighted=rep( grepl("weighted", weighted_or_flat), ncols*ncuts),
samp=character(length=ncols*ncuts),
gene_count=numeric(length=ncols*ncuts), gene_count_baseline=numeric(length=ncols*ncuts),
gene_count_intersect=numeric(length=ncols*ncuts) )
for(s in 1:ncol(temp_mat) ){ ## add a row to df for each sample
for(cut_index in 1:length(cuts)){
ind <- (s-1)*length(cuts)+cut_index ### use arr_ind here instead
temp_df[ind,]$cut_point <- cuts[cut_index]
## negative cutoffs select "below threshold", positive "above";
## jaccard = |both pass| / |either passes|
if(cuts[cut_index]<0){
temp_df[ind,]$jaccard <- ( length( which( baseline_mat[,s] <= cuts[cut_index] & temp_mat[,s] <= cuts[cut_index] )) /
length( which( baseline_mat[,s] <= cuts[cut_index] | temp_mat[,s] <= cuts[cut_index] )) )
temp_df[ind,]$gene_count <- length( which(temp_mat[,s] <= cuts[cut_index] ) )
temp_df[ind,]$gene_count_baseline <- length( which(baseline_mat[,s] <= cuts[cut_index] ) )
temp_df[ind,]$gene_count_intersect <- length( which( baseline_mat[,s] <= cuts[cut_index] & temp_mat[,s] <= cuts[cut_index] ))
} else {
temp_df[ind,]$jaccard <- ( length( which( baseline_mat[,s] >= cuts[cut_index] & temp_mat[,s] >= cuts[cut_index] )) /
length( which( baseline_mat[,s] >= cuts[cut_index] | temp_mat[,s] >= cuts[cut_index] )) )
temp_df[ind,]$gene_count <- length( which(temp_mat[,s] >= cuts[cut_index] ) )
temp_df[ind,]$gene_count_baseline <- length( which(baseline_mat[,s] >= cuts[cut_index] ) )
temp_df[ind,]$gene_count_intersect <- length( which( baseline_mat[,s] >= cuts[cut_index] & temp_mat[,s] >= cuts[cut_index] ))
}
## sample label from baseline matrix column names (NA for unnamed vectors)
temp_df[ind,]$samp <- ifelse(!is.null((colnames(baseline_mat[,s,drop=FALSE] ) )),
unique(colnames(baseline_mat[,s,drop=FALSE] ),
colnames(temp_mat[,s,drop=FALSE] ) ), NA )
}
}
df_internal <- rbind(df_internal, temp_df)
}
df_internal
}
df_2d <- rbind(df_2d, df_temp)
}
Sys.time()
#> [1] "2022-01-02 14:24:56 PST"
## Summarize across replicates and samples: mean/sd of the jaccard index and
## mean gene counts, per (measure, n, cut point, partition, weighting).
## All aggregate() calls share the same grouping, so row order matches.
df_2d_summary_by_partition <-aggregate(df_2d$jaccard, by=list(measure=df_2d$measure, n=df_2d$n,
cut_point=df_2d$cut_point, P1orP2=df_2d$P1orP2,
weighted=df_2d$weighted),
FUN=function(x) mean(x,na.rm=TRUE) )
names(df_2d_summary_by_partition)[length(names(df_2d_summary_by_partition))] <- "jaccard_mean"
df_2d_summary_by_partition$jaccard_sd <- aggregate(df_2d$jaccard, by=list(measure=df_2d$measure, n=df_2d$n,
cut_point=df_2d$cut_point, P1orP2=df_2d$P1orP2,
weighted=df_2d$weighted) ,
FUN=function(x) sd(x,na.rm = TRUE) )$x
df_2d_summary_by_partition$gene_count <- aggregate(df_2d$gene_count, by=list(measure=df_2d$measure, n=df_2d$n,
cut_point=df_2d$cut_point, P1orP2=df_2d$P1orP2,
weighted=df_2d$weighted)
, FUN=function(x) mean(x,na.rm = TRUE) )$x
df_2d_summary_by_partition$gene_count_baseline <- aggregate(df_2d$gene_count_baseline, by=list(measure=df_2d$measure, n=df_2d$n,
cut_point=df_2d$cut_point, P1orP2=df_2d$P1orP2,
weighted=df_2d$weighted),
FUN=function(x) mean(x,na.rm = TRUE) )$x
df_2d_summary_by_partition$gene_count_intersect <- aggregate(df_2d$gene_count_intersect, by=list(measure=df_2d$measure, n=df_2d$n,
cut_point=df_2d$cut_point, P1orP2=df_2d$P1orP2,
weighted=df_2d$weighted),
FUN=function(x) mean(x,na.rm = TRUE) )$x
## Plot mean jaccard index vs. specificity cut point. Two plots are built:
## one for the Z-score measure, one pooling every non-Z-score measure.
ggl_jacc <- list()
for(measure in c("Zscore", "notZscore")){
## pick the summary rows for this measure family (note: `which` shadows base::which)
if(measure == "Zscore"){
which <- which(df_2d_summary_by_partition$measure == measure)
} else {
which <- which(df_2d_summary_by_partition$measure != "Zscore")
}
## ymin/ymax are declared in aes() but no ribbon/errorbar geom is added below,
## so only the line is drawn.
## NOTE(review): jaccard_sd/num_reps looks like an intended standard error,
## which would be sd/sqrt(num_reps) -- confirm.
gg <- ggplot(df_2d_summary_by_partition[which,],
aes(x=cut_point, y=jaccard_mean,
ymin=jaccard_mean-jaccard_sd/num_reps,
ymax=jaccard_mean+jaccard_sd/num_reps,
group=n, fill=n))+
theme_bw()+
theme(
panel.grid.minor.x = element_blank())+
#ggtitle(paste("jaccard index for ", measure, " P1 and P2, weighted and flat measures",sep=""))+
geom_line(aes(color=n))+
scale_color_continuous(low="powderblue", high="steelblue4")+
scale_y_continuous(limits = c(0,1), breaks=seq(0,1,by=0.2))+
## this facet layout is replaced by the facet_grid() calls in both branches below
facet_grid(P1orP2 ~ weighted, labeller=labeller(weighted=c("TRUE"="weighted", "FALSE"="flat")) )
## measure-specific x axis: Z-score cut points run 0-4, other measures 0-1
if(measure == "Zscore"){
gg <- gg+scale_x_continuous(limits = c(0,4), breaks=seq(0,4,by=0.5), expand = c(0,0))
gg <- gg + facet_grid(weighted ~ P1orP2, labeller=labeller(weighted=c("TRUE"="weighted", "FALSE"="flat")) )
} else {
gg <- gg+scale_x_continuous(limits = c(0,1), breaks=seq(0,1,by=0.1) ,expand = c(0,0))
gg <- gg + facet_grid(weighted ~ measure, labeller=labeller(weighted=c("TRUE"="weighted", "FALSE"="flat")) )
}
ggl_jacc[[measure]] <- gg
}
## Render the two jaccard panels stacked: Z-score on top, other measures below.
grobl <- list()
for(n in names(ggl_jacc)){
grobl[[n]] <- ggplotGrob(ggl_jacc[[n]])
}
grid.arrange(grobs =list(grobl$Zscore,grobl$notZscore), nrow=2)
## for random supplemental
## alternate layout used when the run was made with a random partition
if(random_partition){
grid.arrange(grobs = grobl, nrow=2, heights=c(5,4) )
}
## Plot the fraction of genes passing each cut point (gene_count / total genes)
## on a log10 y axis rendered as percentages; same measure split as the
## jaccard plots above.
ggl_count <- list()
for(measure in c("Zscore", "notZscore")){
## pick the summary rows for this measure family (note: `which` shadows base::which)
if(measure == "Zscore"){
which <- which(df_2d_summary_by_partition$measure == measure)
} else {
which <- which(df_2d_summary_by_partition$measure != "Zscore")
}
gg <- ggplot(df_2d_summary_by_partition[which,],
aes(x=cut_point, y=gene_count / nrow(genes),
group=n, fill=n))+
theme_bw()+
theme(
panel.grid.minor.x = element_blank())+
geom_line(aes(color=n))+
scale_color_continuous(low="steelblue2", high="steelblue4")+
## log10 transform of a fraction in [1e-4, 1]; zero counts map to -Inf and
## trigger the "infinite values" warning noted further below
#scale_y_log10( #limits = c(1,3e4),
scale_y_continuous( limits=c(1e-4,1) , trans = log10_trans(),
breaks = trans_breaks("log10", function(x) 10^x), # ) #,
labels = percent, )
## measure-specific x axis and facet layout (same scheme as the jaccard plots)
if(measure == "Zscore"){
gg <- gg+scale_x_continuous(limits = c(0,4), breaks=seq(0,4,by=0.5), expand = c(0,0))
gg <- gg + facet_grid(weighted ~ P1orP2, labeller=labeller(weighted=c("TRUE"="weighted", "FALSE"="flat")) )
} else {
gg <- gg+scale_x_continuous(limits = c(0,1), breaks=seq(0,1,by=0.1) ,expand = c(0,0))
gg <- gg + facet_grid(weighted ~ measure, labeller=labeller(weighted=c("TRUE"="weighted", "FALSE"="flat")) )
}
ggl_count[[measure]] <- gg
}
## Render the gene-count panels (Z-score on top, other measures below).
grobl <- list()
for(n in names(ggl_count)){
grobl[[n]] <- ggplotGrob(ggl_count[[n]])
}
## warning below comes from zero gene counts hitting the log10 y axis
#> Warning: Transformation introduced infinite values in continuous y-axis
grid.arrange(grobs = list(grobl$Zscore, grobl$notZscore), nrow=2)
## for random supplemental
if(random_partition){
grid.arrange(grobs = grobl, nrow=2, heights=c(5,4) )
}
### stats reported in paper
## paired t-test for difference in means
### t = delta / sd / sqrt(n)
## Row selections for the paper's headline comparisons: Z-score measure at the
## largest partition size, cut point 2, for each P1/P2 x flat/weighted combo.
## DRY fix: the four near-identical hand-written which() filters are generated
## in a loop; element order (Z_P1_flat, Z_P1_weighted, Z_P2_flat,
## Z_P2_weighted) matches the original version.
which_list <- list()
for (p in c("P1", "P2")) {
  for (wt in c(FALSE, TRUE)) {
    key <- paste0("Z_", p, "_", if (wt) "weighted" else "flat")
    which_list[[key]] <- which(df_2d$measure == "Zscore"
                               & df_2d$n == max(df_2d$n)
                               & df_2d$cut_point == 2
                               & df_2d$P1orP2 == p
                               & df_2d$weighted == wt)
  }
}
## One-sample t-tests giving mean jaccard and 95% CI for each selection;
## the `#>` lines record the values reported in the paper.
print("t-test P2 flat jaccard")
#> [1] "t-test P2 flat jaccard"
temp <- t.test(x=df_2d[which_list$Z_P2_flat,]$jaccard, conf.level = 0.95)
temp$estimate
#> mean of x
#> 0.2513928
temp$conf.int
#> [1] 0.1101765 0.3926090
#> attr(,"conf.level")
#> [1] 0.95
print("t-test P2 weighted jaccard")
#> [1] "t-test P2 weighted jaccard"
temp <- t.test(x=df_2d[which_list$Z_P2_weighted,]$jaccard, conf.level = 0.95)
temp$estimate
#> mean of x
#> 0.6359852
temp$conf.int
#> [1] 0.5740172 0.6979532
#> attr(,"conf.level")
#> [1] 0.95
print("t-test P1 flat jaccard")
#> [1] "t-test P1 flat jaccard"
temp <- t.test(x=df_2d[which_list$Z_P1_flat,]$jaccard, conf.level = 0.95)
temp$estimate
#> mean of x
#> 0.6495905
temp$conf.int
#> [1] 0.6367873 0.6623936
#> attr(,"conf.level")
#> [1] 0.95
print("t-test P1 weighted jaccard")
#> [1] "t-test P1 weighted jaccard"
temp <- t.test(x=df_2d[which_list$Z_P1_weighted,]$jaccard, conf.level = 0.95)
temp$estimate
#> mean of x
#> 0.807743
temp$conf.int
#> [1] 0.7983065 0.8171795
#> attr(,"conf.level")
#> [1] 0.95
## Build the list of specificity matrices (delta / weighted / flat per
## measure) and draw heatmaps of the genes whose specificity changes most
## between the weighted and flat scorings.
if(TRUE){
### Matrix built from Z score first then add columns of change in other scores
## NOT going to be general to arbitrary sets of matrix measures -> use only Z score and then concatenate other measures
spec_mats <- list()
sim_mat <- similarity_func(exp_mat)
sim_tree <- cluster_func(sim_mat)
## per-tissue weights from the similarity tree; "flat" = all ones (unweighted)
weights <- get_weights(sim_tree, colnames(exp_mat))
flat <- weights; flat[] <- 1
for(measure in specificity_measures$func_names){
specificity_func <- specificity_measures$funcs[[measure]]
if(!is.function(specificity_func)){print(paste(measure, "func is not a function")); next;}
## matrix-valued measures score every gene x tissue; vector-valued measures
## score each gene once and are coerced to one-column matrices
if(specificity_measures$out_type[measure]=="matrix"){
spec_mat_flat <- specificity_func(exp_mat, flat)
spec_mat_weighted <- specificity_func(exp_mat, weights)
} else if (specificity_measures$out_type[measure]=="vector"){
spec_mat_flat <- as.matrix(specificity_func(exp_mat, flat))
rownames(spec_mat_flat) <- rownames(exp_mat)
spec_mat_weighted <- as.matrix(specificity_func(exp_mat, weights))
rownames(spec_mat_weighted) <- rownames(exp_mat)
} else { stop("WARNING: no out type associated with measure") }
## store the weighted-minus-flat delta alongside both raw score matrices
spec_mats[[paste("delta", measure,sep = "_")]] <- spec_mat_weighted - spec_mat_flat
spec_mats[[paste("weighted", measure,sep = "_")]] <- spec_mat_weighted
spec_mats[[paste("flat", measure,sep = "_")]] <- spec_mat_flat
}
## look at top n and bottom n genes with greatest change in specificity score between weighted and flat
gene_list <- list(up=list(),down=list())
for(spec_mat_name in names(spec_mats)[grep("delta", names(spec_mats))] ){
spec_mat <- spec_mats[[spec_mat_name]]
measure <- str_split_fixed(spec_mat_name, "_", 2)[,2]
## for each tissue for matrix measures
if(specificity_measures$out_type[measure]=="matrix"){
n=5 ## top n for each tissue (column) of matrix
temp_name <- measure
gene_list$up[[temp_name]] <- data.frame("gene"=numeric())
gene_list$down[[temp_name]] <- data.frame("gene"=numeric())
## go in same order as similarity tree
## NOTE(review): rownames(spec_mat_flat) below refers to the matrix left over
## from the previous loop, not to spec_mat itself -- safe only while every
## spec matrix shares identical row names; confirm.
for(i in get_leaves_attr(sim_tree, "label")){
gene_list$up[[temp_name]] <- rbind(gene_list$up[[temp_name]],
data.frame("gene"=rownames(spec_mat_flat)[order((spec_mat[,i]), decreasing = TRUE)[1:n]]))
}
## bottom n = top n of the negated column
for(i in get_leaves_attr(sim_tree, "label")){
gene_list$down[[temp_name]] <- rbind(gene_list$down[[temp_name]],
data.frame("gene" = rownames(spec_mat_flat)[order((spec_mat[,i]*(-1)), decreasing = TRUE)[1:n]]))
}
} else if (specificity_measures$out_type[measure]=="vector"){
} else { stop("WARNING: no out type associated with measure") }
}
### gene lists generated now to organize into heatmaps
## set up color functions for heatmaps
## breaks span at least [-2, 2]; the extremes stretch to the observed range.
## NOTE(review): the lower break uses the current type's matrix but the upper
## break always uses spec_mats$delta_Zscore -- possibly intentional, confirm.
col_funs <- list()
for(type in c("delta","flat","weighted")){
col_funs[[paste( type, "Zscore" ,sep="_")]] <-colorRamp2(c(min(c(-2, min(spec_mats[[paste( type, "Zscore" ,sep="_")]], na.rm = T)) ),
-1.99, -1, 0, 1, 1.99,
max(c(2, max( spec_mats$delta_Zscore, na.rm = T) ) )),
c("darkblue", "blue", "#B8B8FF" ,"white", "#FFB8B8", "red","red4"))
}
## get genes in top and bottom (note: genes may be repeated since want to see top n for each group )
gene_vec <- data.frame(gene = character())
for(ud in names(gene_list)){
for(measure in names(gene_list[[ud]])){
gene_vec <- rbind(gene_vec, gene_list[[ud]][[measure]])
}
}
## one HeatmapList per matrix type; Z-score heatmaps keep tissue columns in
## similarity-tree order, other measures are left to cluster freely
heatmaps <- list()
for(type in c("delta", "flat", "weighted")){
heatmaps[[type]] <- HeatmapList()
for(measure in names(gene_list[["up"]])){
spec_mat_name <- names(spec_mats)[intersect(grep(measure, names(spec_mats)), grep(type, names(spec_mats)))]
spec_mat <- spec_mats[[spec_mat_name ]]
temp_name <- paste(spec_mat_name, sep="_")
if(measure=="Zscore"){
h1 <- Heatmap(spec_mat[gene_vec$gene,], name=temp_name,
row_order = gene_vec$gene, show_row_names = FALSE,
cluster_columns = sim_tree,
column_order = colnames(spec_mat),
col = col_funs[[paste(type,measure, sep="_")]])
} else {
h1 <- Heatmap(spec_mat[gene_vec$gene,], name=temp_name,
row_order = gene_vec$gene, show_row_names = FALSE,
col = col_funs[[paste(type,measure, sep="_")]])
}
## first heatmap seeds the list, later ones are concatenated side by side
if(length(heatmaps[[type]])==0){
heatmaps[[type]] <- h1
} else {
heatmaps[[type]] <- heatmaps[[type]] + h1
}
}
}
## render: delta alone, then flat and weighted side by side
heatmaps$delta
heatmaps$flat + heatmaps$weighted
}
## Map the showcase gene symbols to versioned Ensembl IDs via gtex_rowinfo.
head(gtex_rowinfo)
#> Name Description
#> 1 ENSG00000223972.5 DDX11L1
#> 2 ENSG00000227232.5 WASH7P
#> 3 ENSG00000278267.1 MIR6859-1
#> 4 ENSG00000243485.5 MIR1302-2HG
#> 5 ENSG00000237613.2 FAM138A
#> 6 ENSG00000268020.3 OR4G4P
## showcase genes, in display order (pairs: OLIG*, pancreas PRSS1/PTF1A,
## heart MYH6/MYL4)
specific_genes <- c( "OLIG1","OLIG2",
"PRSS1", "PTF1A",
"MYH6", "MYL4")
## keep only symbols that exist in gtex_rowinfo AND in the specificity matrices
spec_genes_w_ids <- gtex_rowinfo[which( is.element(gtex_rowinfo$Description, specific_genes)),]
spec_genes_w_ids <- spec_genes_w_ids[which(is.element(spec_genes_w_ids$Name, rownames(spec_mats$delta_Zscore))),]
## match() restores the display order given in specific_genes (drops missing as NA)
spec_genes_w_ids <- spec_genes_w_ids[match(specific_genes, spec_genes_w_ids$Description),]
## cross with flat/weighted so each gene contributes one row per weighting
spec_genes_w_ids <- merge( data.frame(f_or_w = c("flat","weighted")),spec_genes_w_ids)
## drop NA rows introduced by match() for genes absent from the matrices
spec_genes_w_ids <- spec_genes_w_ids[complete.cases(spec_genes_w_ids),]
## Color functions for the showcase-gene heatmaps, one per measure type.
col_funs <- list()
for(measure in unique(str_split_fixed(names(spec_mats), "_", 2)[,2] )){
  if(measure=="Zscore"){
    ## BUG FIX: previously one `na.rm = TRUE` sat inside c(...) instead of
    ## inside max(...), so it became an extra element (coerced to 1) while the
    ## inner max() ran without na.rm, letting NA propagate into the breaks.
    lo <- min(c(min(spec_mats$weighted_Zscore[spec_genes_w_ids$Name,], na.rm = TRUE),
                min(spec_mats$flat_Zscore[spec_genes_w_ids$Name,], na.rm = TRUE),
                -4))
    hi <- max(c(max(spec_mats$weighted_Zscore[spec_genes_w_ids$Name,], na.rm = TRUE),
                max(spec_mats$flat_Zscore[spec_genes_w_ids$Name,], na.rm = TRUE),
                4))
    ## 8 break points -> 8 colors; the -4/4 floor keeps lo < -3 and hi > 3.99
    col_funs[[measure]] <- colorRamp2(c(lo, -3, -2, 0, 2, 3, 3.99, hi),
                                      c("darkblue", "blue", "#aaaaff", "white", "#ffaaaa", "red", "red4", "grey20"))
  } else {
    ## fraction-valued measures live on [0, 1]
    col_funs[[measure]] <- colorRamp2(c( 0, 0.4, 0.6, 0.8, 1),
                                      c("white", "lightgoldenrod1","yellow" ,"red","red4"))
  }
}
## Showcase-gene heatmap: alternating flat/weighted rows per gene, tissue
## columns ordered by the similarity tree.
heatmaps <- HeatmapList()
for(measure in unique(str_split_fixed(names(spec_mats), "_", 2)[,2] )){
## one row per (gene, flat/weighted) pair; columns follow the delta matrix
temp_mat <- matrix(nrow=nrow(spec_genes_w_ids),ncol=ncol(spec_mats[[paste("delta",measure, sep = "_")]]))
colnames(temp_mat) <- colnames(spec_mats[[paste("delta",measure, sep = "_")]])
rownames(temp_mat) <- spec_genes_w_ids$Name
for(i in 1:nrow(spec_genes_w_ids)){
temp_mat[i,] <- spec_mats[[paste(spec_genes_w_ids$f_or_w[i], measure, sep = "_")]][spec_genes_w_ids$Name[i],]
}
if(measure=="Zscore"){
## rows come in flat/weighted pairs; split + block annotation labels each gene
split <- rep(1:(nrow(spec_genes_w_ids)/2), each=2)
ra <- rowAnnotation(foo = anno_block(labels = paste(unique(spec_genes_w_ids$Description), "\nW F")))
h1 <- Heatmap(temp_mat, name=measure,
cluster_rows = FALSE, show_row_names = FALSE,
cluster_columns = sim_tree,
column_order = colnames(spec_mats$delta_Zscore),
row_split = split,
left_annotation = ra,
col = col_funs[[measure]]
)
} else {
## NOTE(review): non-Z measures are currently disabled (branch commented
## out); h1 keeps its value from a previous iteration here, which relies on
## "Zscore" being processed first -- confirm intent.
# h1 <- Heatmap(temp_mat, name=measure,
# cluster_rows = FALSE, show_row_names = TRUE,
# col = col_funs[[measure]])
}
## only the first heatmap is kept (the append branch is commented out)
if(length(heatmaps)==0){
heatmaps <- h1
# } else {
# heatmaps <- heatmaps + h1
}
}
draw(heatmaps)
## get particular values reported in paper
## Assemble the flat/weighted Z-score rows of the showcase genes into one
## matrix, one row per (gene, weighting) pair, tissues as columns.
measure <- "Zscore"
gene_rows <- lapply(seq_len(nrow(spec_genes_w_ids)), function(i) {
  mat_name <- paste(spec_genes_w_ids$f_or_w[i], measure, sep = "_")
  spec_mats[[mat_name]][spec_genes_w_ids$Name[i], ]
})
temp_mat <- do.call(rbind, gene_rows)
colnames(temp_mat) <- colnames(spec_mats[[paste("delta", measure, sep = "_")]])
## label rows "flat <gene>" / "weighted <gene>" (f_or_w alternates flat/weighted)
rownames(temp_mat) <- paste(c("flat","weighted"), spec_genes_w_ids$Description)
## gene ontology summaries
library(DOSE)
library(pathview)
library(clusterProfiler)
library(org.Hs.eg.db)
library(tidyverse)
library(topGO)
library(enrichplot)
## recompute weighted and flat (all-ones weights, same idiom as above) Z-score
## specificity matrices
spec_mat_weighted <- calc_weighted_zscore_matrix(exp_mat, weights = weights)
flat <- weights; flat[] <- 1
spec_mat_flat <- calc_weighted_zscore_matrix(exp_mat, flat)
cutoff <- 2
## universe: every gene in the matrix, Ensembl IDs with version suffix stripped
allOE_genes<-str_split_fixed(rownames(spec_mat_flat),"[.]",2)[,1]
## genes of interest: specific under the weighted score (>= cutoff) while the
## flat score is <= cutoff, in at least one tissue.
## Vectorized over the whole matrix (the old per-column loop grew a character
## vector inside the loop and left an unused strict-inequality `which` behind).
## which(..., arr.ind = TRUE) returns indices in column-major order, so the
## unique() result reproduces the old column-by-column collection order exactly.
hits <- which(spec_mat_flat <= cutoff & spec_mat_weighted >= cutoff, arr.ind = TRUE)
sigOE_genes <- unique(str_split_fixed(rownames(spec_mat_flat)[hits[, "row"]], "[.]", 2)[, 1])
## GO over-representation test of the weighted-only specific genes against the
## full gene universe; only the BP ontology is currently enabled.
for(onto in c("BP")){ #},"MF","CC")){
print(onto)
ego <- enrichGO(gene = sigOE_genes,
universe = allOE_genes,
keyType = "ENSEMBL",
OrgDb = org.Hs.eg.db,
ont = onto,
pAdjustMethod = "BH",
qvalueCutoff = 0.05,
readable = TRUE,
pool=TRUE)
n=10 ## number of top GO terms to display
print(dotplot(ego, showCategory = n))
## term-similarity matrix is required before emapplot can be drawn
ego <- enrichplot::pairwise_termsim(ego)
print(emapplot(ego, showCategory = n))
}
#> [1] "BP"
#> wrong orderBy parameter; set to default `orderBy = "x"`
## flatten the (last) enrichment result; frac_terms = genes hit in the term /
## genes of the term present in the universe (GeneRatio and BgRatio numerators)
cluster_summary <- data.frame(ego)
cluster_summary$frac_terms <- as.numeric(str_split_fixed(cluster_summary$GeneRatio,"/",2)[,1] ) / as.numeric( str_split_fixed(cluster_summary$BgRatio,"/",2)[,1] )
## only 1 similarity function tested for now, can make as list later
## Z-score the expression matrix, then take pairwise cosine similarity.
similarity_func <- function(exp_mat){
  zscores <- calc_zscore_matrix(exp_mat)
  calc_dot_product_similarity_matrix(zscores)
}
## only 1 clustering function tested for now, can make as a list later
## Convert similarity to distance (1 - sim), cluster, and decorate the
## dendrogram with parent distances and weights.
cluster_func <- function(sim_mat, method="single"){
  dend <- as.dendrogram(hclust(as.dist(1 - sim_mat), method = method))
  add_weights(add_dist_to_parent(dend))
}
## Compare linkage methods for the sample-similarity dendrogram.
sim_mat <- similarity_func(exp_mat)
cluster_methods <- c("single","complete","average")
for(cm in cluster_methods){
  par(mar = c(16, 2, 2, 2))  # wide bottom margin for long tissue labels
  sim_tree <- cluster_func(sim_mat = sim_mat, method = cm)
  plot(sim_tree, main = cm)
}
### to do: justify method for measuring sample similarity - may require comparison or refs
## Cosine (normalised dot-product) similarity between every pair of columns
## of `dat`. Returns an ncol x ncol matrix named by the columns of `dat`.
## BUG FIX: the previous version dropped NAs independently per column
## (which_i / which_j), so when two columns had NAs in different rows the
## element-wise product paired values from DIFFERENT rows (and recycled when
## the lengths differed). Rows are now restricted to those observed in BOTH
## columns, keeping numerator and norms aligned (pairwise-complete).
calc_dot_product_similarity_matrix <- function(dat) {
  n <- ncol(dat)
  sim <- matrix(0, nrow = n, ncol = n)
  colnames(sim) <- colnames(dat)
  rownames(sim) <- colnames(dat)
  for (i in seq_len(n)) {
    for (j in seq_len(n)) {
      ok <- !is.na(dat[, i]) & !is.na(dat[, j])  ## rows present in both columns
      xi <- dat[ok, i]
      xj <- dat[ok, j]
      ## norm(x, "2") of a vector is its Euclidean length (spectral norm of
      ## the implicit 1-column matrix), as in the original implementation
      sim[i, j] <- sum(xi * xj) / (norm(xi, "2") * norm(xj, "2"))
    }
  }
  return(sim)
}
## Dot-product similarity heatmap, similarities rescaled onto [0, 1].
sim_mat <- similarity_func(exp_mat)
sim_range <- range(sim_mat, na.rm = TRUE)
sim_mat <- (sim_mat - sim_range[1]) / (sim_range[2] - sim_range[1])  ## coerce domain to [0-1]
sim_tree <- cluster_func(sim_mat)
heatmap(sim_mat, Rowv = rev(sim_tree), Colv = sim_tree, scale = "none", margins = c(11, 13))
####ADDED
## Pairwise Euclidean distance between the columns of `dat`.
## NAs in either column are dropped pairwise via na.rm inside sum().
## Returns an ncol x ncol matrix named by the columns of `dat`.
calc_eucl_matrix <- function(dat) {
  n_samples <- ncol(dat)
  sample_names <- colnames(dat)
  out <- matrix(0, nrow = n_samples, ncol = n_samples,
                dimnames = list(sample_names, sample_names))
  for (a in seq_len(n_samples)) {
    for (b in seq_len(n_samples)) {
      diffs <- dat[, a] - dat[, b]
      out[a, b] <- sqrt(sum(diffs^2, na.rm = TRUE))
    }
  }
  return(out)
}
## Euclidean-distance-based similarity heatmap (1 - distance), rescaled to [0, 1].
eucl_matrix <- 1 - calc_eucl_matrix(calc_zscore_matrix(exp_mat))
eucl_range <- range(eucl_matrix, na.rm = TRUE)
eucl_matrix <- (eucl_matrix - eucl_range[1]) / (eucl_range[2] - eucl_range[1])  ## coerce domain to [0-1]
sim_tree <- cluster_func(eucl_matrix)
heatmap(eucl_matrix, Rowv = rev(sim_tree), Colv = sim_tree, scale = "none", margins = c(11, 13))
####ADDED
## Pairwise Manhattan (L1) distance between the columns of `dat`, normalised
## by the largest pairwise distance so values land in [0, 1].
## BUG FIX: the original recomputed max(manhatt) inside the normalisation
## loop, so columns processed after the ones holding the global maximum were
## divided by a smaller, already-normalised maximum (the result depended on
## column order). The divisor is now computed once before normalising.
calc_manhattan_matrix <- function(dat) {
  n <- ncol(dat)
  manhatt <- matrix(0, nrow = n, ncol = n)
  colnames(manhatt) <- colnames(dat)
  rownames(manhatt) <- colnames(dat)
  for (i in seq_len(n)) {
    for (j in seq_len(n)) {
      ## NAs in either column are dropped pairwise
      manhatt[i, j] <- sum(abs(dat[, i] - dat[, j]), na.rm = TRUE)
    }
  }
  max_dist <- max(manhatt, na.rm = TRUE)
  if (max_dist > 0) {  # guard: all-identical columns would give 0/0 = NaN
    manhatt <- manhatt / max_dist
  }
  return(manhatt)
}
## Manhattan-distance-based similarity heatmap (1 - distance), rescaled to [0, 1].
manhat_matrix <- 1 - calc_manhattan_matrix(calc_zscore_matrix(exp_mat))
manhat_range <- range(manhat_matrix, na.rm = TRUE)
manhat_matrix <- (manhat_matrix - manhat_range[1]) / (manhat_range[2] - manhat_range[1])  ## coerce domain to [0-1]
sim_tree <- cluster_func(manhat_matrix)
heatmap(manhat_matrix, Rowv = sim_tree, Colv = rev(sim_tree), scale = "none", margins = c(11, 13))
## Pairwise Canberra distance between the columns of `dat`, normalised by the
## largest pairwise distance so values land in [0, 1].
## BUG FIX: as with the Manhattan version, the original recomputed
## max(canberra) inside the normalisation loop, making the scaling depend on
## column order once the max-holding columns had been normalised. The divisor
## is now computed once before normalising.
calc_canberra_matrix <- function(dat) {
  n <- ncol(dat)
  canberra <- matrix(0, nrow = n, ncol = n)
  colnames(canberra) <- colnames(dat)
  rownames(canberra) <- colnames(dat)
  for (i in seq_len(n)) {
    for (j in seq_len(n)) {
      ## terms where both values are 0 give 0/0 = NaN and are dropped by na.rm
      canberra[i, j] <- sum(abs(dat[, i] - dat[, j]) / (abs(dat[, i]) + abs(dat[, j])), na.rm = TRUE)
    }
  }
  max_dist <- max(canberra, na.rm = TRUE)
  if (max_dist > 0) {  # guard: identical columns everywhere would give 0/0
    canberra <- canberra / max_dist
  }
  return(canberra)
}
## Canberra-distance-based similarity heatmap (1 - distance), rescaled to [0, 1].
canberra_mat <- 1 - calc_canberra_matrix(calc_zscore_matrix(exp_mat))
canberra_range <- range(canberra_mat, na.rm = TRUE)
canberra_mat <- (canberra_mat - canberra_range[1]) / (canberra_range[2] - canberra_range[1])  ## coerce domain to [0-1]
sim_tree <- cluster_func(canberra_mat)
heatmap(canberra_mat, Rowv = sim_tree, Colv = rev(sim_tree), scale = "none", margins = c(11, 13))